pax_global_header00006660000000000000000000000064141650023720014512gustar00rootroot0000000000000052 comment=b91df1a02ea2737eddaa6c7e9bd5b6e58c527ea6 libyuv-0.0~git20220104.b91df1a/000077500000000000000000000000001416500237200155265ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/.clang-format000066400000000000000000000002621416500237200201010ustar00rootroot00000000000000# Defines the Chromium style for automatic reformatting. # http://clang.llvm.org/docs/ClangFormatStyleOptions.html BasedOnStyle: Chromium --- Language: Java BasedOnStyle: Google libyuv-0.0~git20220104.b91df1a/.gitignore000066400000000000000000000006641416500237200175240ustar00rootroot00000000000000*.pyc .landmines pin-log.txt /base /build /buildtools /google_apis /links /links.db /ios /mojo /native_client /net /out /source/out /sde-avx-sse-transition-out.txt /testing /third_party /tools # Files generated by CMake build cmake_install.cmake CMakeCache.txt CMakeFiles/ yuvconvert libgtest.a libyuv.a libyuv_unittest # Files generated by winarm.mk build libyuv_arm.lib source/*.o # Files generated by perf perf.data perf.data.old libyuv-0.0~git20220104.b91df1a/.gn000066400000000000000000000031521416500237200161340ustar00rootroot00000000000000# Copyright 2015 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. import("//build/dotfile_settings.gni") # The location of the build configuration file. buildconfig = "//build/config/BUILDCONFIG.gn" # The python interpreter to use by default. On Windows, this will look # for python3.exe and python3.bat. 
script_executable = "python3" # The secondary source root is a parallel directory tree where # GN build files are placed when they can not be placed directly # in the source tree, e.g. for third party source trees. secondary_source = "//build/secondary/" # These are the targets to check headers for by default. The files in targets # matching these patterns (see "gn help label_pattern" for format) will have # their includes checked for proper dependencies when you run either # "gn check" or "gn gen --check". check_targets = [ "//libyuv/*" ] # These are the list of GN files that run exec_script. This whitelist exists # to force additional review for new uses of exec_script, which is strongly # discouraged except for gypi_to_gn calls. exec_script_whitelist = build_dotfile_settings.exec_script_whitelist + [ "//build_overrides/build.gni" ] default_args = { mac_sdk_min = "10.12" # https://bugs.chromium.org/p/libyuv/issues/detail?id=826 ios_deployment_target = "10.0" } libyuv-0.0~git20220104.b91df1a/.vpython000066400000000000000000000033271416500237200172430ustar00rootroot00000000000000# This is a vpython "spec" file. # # It describes patterns for python wheel dependencies of the python scripts in # the chromium repo, particularly for dependencies that have compiled components # (since pure-python dependencies can be easily vendored into third_party). # # When vpython is invoked, it finds this file and builds a python VirtualEnv, # containing all of the dependencies described in this file, fetching them from # CIPD (the "Chrome Infrastructure Package Deployer" service). Unlike `pip`, # this never requires the end-user machine to have a working python extension # compilation environment. All of these packages are built using: # https://chromium.googlesource.com/infra/infra/+/master/infra/tools/dockerbuild/ # # All python scripts in the repo share this same spec, to avoid dependency # fragmentation. 
# # If you have depot_tools installed in your $PATH, you can invoke python scripts # in this repo by running them as you normally would run them, except # substituting `vpython` instead of `python` on the command line, e.g.: # vpython path/to/script.py some --arguments # # Read more about `vpython` and how to modify this file here: # https://chromium.googlesource.com/infra/infra/+/master/doc/users/vpython.md python_version: "2.7" # Used by: # third_party/catapult wheel: < name: "infra/python/wheels/psutil/${platform}_${py_python}_${py_abi}" version: "version:5.2.2" > # Used by: # third_party/catapult wheel: < name: "infra/python/wheels/pypiwin32/${vpython_platform}" version: "version:219" match_tag: < platform: "win32" > match_tag: < platform: "win_amd64" > > # Used by: # build/android wheel: < name: "infra/python/wheels/requests-py2_py3" version: "version:2.13.0" > libyuv-0.0~git20220104.b91df1a/AUTHORS000066400000000000000000000001421416500237200165730ustar00rootroot00000000000000# Names should be added to this file like so: # Name or Organization Google Inc. libyuv-0.0~git20220104.b91df1a/Android.bp000066400000000000000000000110331416500237200174270ustar00rootroot00000000000000package { default_applicable_licenses: ["external_libyuv_files_license"], } // Added automatically by a large-scale-change // // large-scale-change included anything that looked like it might be a license // text as a license_text. e.g. LICENSE, NOTICE, COPYING etc. // // Please consider removing redundant or irrelevant files from 'license_text:'. 
// See: http://go/android-license-faq license { name: "external_libyuv_files_license", visibility: [":__subpackages__"], license_kinds: [ "SPDX-license-identifier-BSD", ], license_text: [ "LICENSE", "LICENSE_THIRD_PARTY", "PATENTS", ], } cc_library { name: "libyuv", vendor_available: true, product_available: true, host_supported: true, vndk: { enabled: true, }, srcs: [ "source/compare.cc", "source/compare_common.cc", "source/compare_gcc.cc", "source/compare_mmi.cc", "source/compare_msa.cc", "source/compare_neon.cc", "source/compare_neon64.cc", "source/convert.cc", "source/convert_argb.cc", "source/convert_from.cc", "source/convert_from_argb.cc", "source/convert_jpeg.cc", "source/convert_to_argb.cc", "source/convert_to_i420.cc", "source/cpu_id.cc", "source/mjpeg_decoder.cc", "source/mjpeg_validate.cc", "source/planar_functions.cc", "source/rotate.cc", "source/rotate_any.cc", "source/rotate_argb.cc", "source/rotate_common.cc", "source/rotate_gcc.cc", "source/rotate_mmi.cc", "source/rotate_msa.cc", "source/rotate_neon.cc", "source/rotate_neon64.cc", "source/row_any.cc", "source/row_common.cc", "source/row_gcc.cc", "source/row_mmi.cc", "source/row_msa.cc", "source/row_neon.cc", "source/row_neon64.cc", "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", "source/scale_common.cc", "source/scale_gcc.cc", "source/scale_mmi.cc", "source/scale_msa.cc", "source/scale_neon.cc", "source/scale_neon64.cc", "source/scale_uv.cc", "source/video_common.cc", ], cflags: [ "-Wall", "-Werror", "-Wno-unused-parameter", "-fexceptions", "-DHAVE_JPEG", ], arch: { arm: { cflags: ["-mfpu=neon"], }, }, shared_libs: ["libjpeg"], export_include_dirs: ["include"], apex_available: [ "//apex_available:platform", "com.android.media.swcodec", ], min_sdk_version: "29", } // compatibilty static library until all uses of libyuv_static are replaced // with libyuv (b/37646797) cc_library_static { name: "libyuv_static", vendor_available: true, whole_static_libs: ["libyuv"], apex_available: [ 
"//apex_available:platform", "com.android.media.swcodec", ], min_sdk_version: "29", } cc_test { name: "libyuv_unittest", static_libs: ["libyuv"], shared_libs: ["libjpeg"], cflags: ["-Wall", "-Werror"], srcs: [ "unit_test/basictypes_test.cc", "unit_test/color_test.cc", "unit_test/compare_test.cc", "unit_test/convert_test.cc", "unit_test/cpu_test.cc", "unit_test/cpu_thread_test.cc", "unit_test/math_test.cc", "unit_test/planar_test.cc", "unit_test/rotate_argb_test.cc", "unit_test/rotate_test.cc", "unit_test/scale_argb_test.cc", "unit_test/scale_test.cc", "unit_test/scale_uv_test.cc", "unit_test/unit_test.cc", "unit_test/video_common_test.cc", ], } cc_test { name: "compare", gtest: false, srcs: [ "util/compare.cc", ], static_libs: ["libyuv"], } cc_test { name: "cpuid", gtest: false, srcs: [ "util/cpuid.c", ], static_libs: ["libyuv"], } cc_test { name: "i444tonv12_eg", gtest: false, srcs: [ "util/i444tonv12_eg.cc", ], static_libs: ["libyuv"], } cc_test { name: "psnr", gtest: false, srcs: [ "util/psnr_main.cc", "util/psnr.cc", "util/ssim.cc", ], static_libs: ["libyuv"], } cc_test { name: "yuvconstants", gtest: false, srcs: [ "util/yuvconstants.c", ], static_libs: ["libyuv"], } cc_test { name: "yuvconvert", gtest: false, srcs: [ "util/yuvconvert.cc", ], static_libs: ["libyuv"], shared_libs: ["libjpeg"], } libyuv-0.0~git20220104.b91df1a/Android.mk000066400000000000000000000062761416500237200174520ustar00rootroot00000000000000# This is the Android makefile for libyuv for NDK. 
LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) LOCAL_CPP_EXTENSION := .cc LOCAL_SRC_FILES := \ source/compare.cc \ source/compare_common.cc \ source/compare_gcc.cc \ source/compare_mmi.cc \ source/compare_msa.cc \ source/compare_neon.cc \ source/compare_neon64.cc \ source/compare_win.cc \ source/convert.cc \ source/convert_argb.cc \ source/convert_from.cc \ source/convert_from_argb.cc \ source/convert_to_argb.cc \ source/convert_to_i420.cc \ source/cpu_id.cc \ source/planar_functions.cc \ source/rotate.cc \ source/rotate_any.cc \ source/rotate_argb.cc \ source/rotate_common.cc \ source/rotate_gcc.cc \ source/rotate_mmi.cc \ source/rotate_msa.cc \ source/rotate_neon.cc \ source/rotate_neon64.cc \ source/rotate_win.cc \ source/row_any.cc \ source/row_common.cc \ source/row_gcc.cc \ source/row_mmi.cc \ source/row_msa.cc \ source/row_neon.cc \ source/row_neon64.cc \ source/row_win.cc \ source/scale.cc \ source/scale_any.cc \ source/scale_argb.cc \ source/scale_common.cc \ source/scale_gcc.cc \ source/scale_mmi.cc \ source/scale_msa.cc \ source/scale_neon.cc \ source/scale_neon64.cc \ source/scale_uv.cc \ source/scale_win.cc \ source/video_common.cc common_CFLAGS := -Wall -fexceptions ifneq ($(LIBYUV_DISABLE_JPEG), "yes") LOCAL_SRC_FILES += \ source/convert_jpeg.cc \ source/mjpeg_decoder.cc \ source/mjpeg_validate.cc common_CFLAGS += -DHAVE_JPEG LOCAL_SHARED_LIBRARIES := libjpeg endif LOCAL_CFLAGS += $(common_CFLAGS) LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include LOCAL_C_INCLUDES += $(LOCAL_PATH)/include LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)/include LOCAL_MODULE := libyuv_static LOCAL_MODULE_TAGS := optional include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) LOCAL_WHOLE_STATIC_LIBRARIES := libyuv_static LOCAL_MODULE := libyuv ifneq ($(LIBYUV_DISABLE_JPEG), "yes") LOCAL_SHARED_LIBRARIES := libjpeg endif include $(BUILD_SHARED_LIBRARY) include $(CLEAR_VARS) LOCAL_STATIC_LIBRARIES := libyuv_static LOCAL_SHARED_LIBRARIES := libjpeg LOCAL_MODULE_TAGS := tests 
LOCAL_CPP_EXTENSION := .cc LOCAL_C_INCLUDES += $(LOCAL_PATH)/include LOCAL_SRC_FILES := \ unit_test/basictypes_test.cc \ unit_test/color_test.cc \ unit_test/compare_test.cc \ unit_test/convert_test.cc \ unit_test/cpu_test.cc \ unit_test/cpu_thread_test.cc \ unit_test/math_test.cc \ unit_test/planar_test.cc \ unit_test/rotate_argb_test.cc \ unit_test/rotate_test.cc \ unit_test/scale_argb_test.cc \ unit_test/scale_test.cc \ unit_test/scale_uv_test.cc \ unit_test/unit_test.cc \ unit_test/video_common_test.cc LOCAL_MODULE := libyuv_unittest include $(BUILD_NATIVE_TEST) libyuv-0.0~git20220104.b91df1a/BUILD.gn000066400000000000000000000241121416500237200167130ustar00rootroot00000000000000# Copyright 2014 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. import("//testing/test.gni") import("libyuv.gni") declare_args() { # Set to false to disable building with absl flags. libyuv_use_absl_flags = true # When building a shared library using a target in WebRTC or # Chromium projects that depends on libyuv, setting this flag # to true makes libyuv symbols visible inside that library. libyuv_symbols_visible = false } config("libyuv_config") { include_dirs = [ "include" ] if (is_android && current_cpu == "arm64") { ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker64" ] } if (is_android && current_cpu != "arm64") { ldflags = [ "-Wl,--dynamic-linker,/system/bin/linker" ] } } # This target is built when no specific target is specified on the command line. 
group("default") { testonly = true deps = [ ":libyuv" ] if (libyuv_include_tests) { deps += [ ":compare", ":cpuid", ":i444tonv12_eg", ":libyuv_unittest", ":psnr", ":yuvconstants", ":yuvconvert", ] } } group("libyuv") { all_dependent_configs = [ ":libyuv_config" ] deps = [] if (is_win && target_cpu == "x64") { # Compile with clang in order to get inline assembly public_deps = [ ":libyuv_internal(//build/toolchain/win:win_clang_x64)" ] } else { public_deps = [ ":libyuv_internal" ] } if (libyuv_use_neon) { deps += [ ":libyuv_neon" ] } if (libyuv_use_msa) { deps += [ ":libyuv_msa" ] } if (libyuv_use_mmi) { deps += [ ":libyuv_mmi" ] } if (!is_ios && !libyuv_disable_jpeg) { # Make sure that clients of libyuv link with libjpeg. This can't go in # libyuv_internal because in Windows x64 builds that will generate a clang # build of libjpeg, and we don't want two copies. deps += [ "//third_party:jpeg" ] } } static_library("libyuv_internal") { visibility = [ ":*" ] sources = [ # Headers "include/libyuv.h", "include/libyuv/basic_types.h", "include/libyuv/compare.h", "include/libyuv/convert.h", "include/libyuv/convert_argb.h", "include/libyuv/convert_from.h", "include/libyuv/convert_from_argb.h", "include/libyuv/cpu_id.h", "include/libyuv/mjpeg_decoder.h", "include/libyuv/planar_functions.h", "include/libyuv/rotate.h", "include/libyuv/rotate_argb.h", "include/libyuv/rotate_row.h", "include/libyuv/row.h", "include/libyuv/scale.h", "include/libyuv/scale_argb.h", "include/libyuv/scale_row.h", "include/libyuv/scale_uv.h", "include/libyuv/version.h", "include/libyuv/video_common.h", # Source Files "source/compare.cc", "source/compare_common.cc", "source/compare_gcc.cc", "source/compare_win.cc", "source/convert.cc", "source/convert_argb.cc", "source/convert_from.cc", "source/convert_from_argb.cc", "source/convert_jpeg.cc", "source/convert_to_argb.cc", "source/convert_to_i420.cc", "source/cpu_id.cc", "source/mjpeg_decoder.cc", "source/mjpeg_validate.cc", "source/planar_functions.cc", 
"source/rotate.cc", "source/rotate_any.cc", "source/rotate_argb.cc", "source/rotate_common.cc", "source/rotate_gcc.cc", "source/rotate_win.cc", "source/row_any.cc", "source/row_common.cc", "source/row_gcc.cc", "source/row_win.cc", "source/scale.cc", "source/scale_any.cc", "source/scale_argb.cc", "source/scale_common.cc", "source/scale_gcc.cc", "source/scale_uv.cc", "source/scale_win.cc", "source/video_common.cc", ] configs += [ ":libyuv_config" ] defines = [] deps = [] if (libyuv_symbols_visible) { configs -= [ "//build/config/gcc:symbol_visibility_hidden" ] configs += [ "//build/config/gcc:symbol_visibility_default" ] } if (!is_ios && !libyuv_disable_jpeg) { defines += [ "HAVE_JPEG" ] # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps # because in Windows x64 build it will get compiled with clang. deps += [ "//third_party:jpeg_includes" ] } # Always enable optimization for Release and NaCl builds (to workaround # crbug.com/538243). if (!is_debug || is_nacl) { configs -= [ "//build/config/compiler:default_optimization" ] # Enable optimize for speed (-O2) over size (-Os). configs += [ "//build/config/compiler:optimize_max" ] } # To enable AVX2 or other cpu optimization, pass flag here if (!is_win) { cflags = [ # "-mpopcnt", # "-mavx2", # "-mfma", "-ffp-contract=fast", # Enable fma vectorization for NEON. ] } if (!libyuv_use_mmi) { defines += [ "LIBYUV_DISABLE_MMI" ] } } if (libyuv_use_neon) { static_library("libyuv_neon") { sources = [ # ARM Source Files "source/compare_neon.cc", "source/compare_neon64.cc", "source/rotate_neon.cc", "source/rotate_neon64.cc", "source/row_neon.cc", "source/row_neon64.cc", "source/scale_neon.cc", "source/scale_neon64.cc", ] deps = [ ":libyuv_internal" ] public_configs = [ ":libyuv_config" ] # Always enable optimization for Release and NaCl builds (to workaround # crbug.com/538243). if (!is_debug) { configs -= [ "//build/config/compiler:default_optimization" ] # Enable optimize for speed (-O2) over size (-Os). 
# TODO(fbarchard): Consider optimize_speed which is O3. configs += [ "//build/config/compiler:optimize_max" ] } if (current_cpu != "arm64") { configs -= [ "//build/config/compiler:compiler_arm_fpu" ] cflags = [ "-mfpu=neon" ] } } } if (libyuv_use_msa) { static_library("libyuv_msa") { sources = [ # MSA Source Files "source/compare_msa.cc", "source/rotate_msa.cc", "source/row_msa.cc", "source/scale_msa.cc", ] deps = [ ":libyuv_internal" ] public_configs = [ ":libyuv_config" ] } } if (libyuv_use_mmi) { static_library("libyuv_mmi") { sources = [ # MMI Source Files "source/compare_mmi.cc", "source/rotate_mmi.cc", "source/row_mmi.cc", "source/scale_mmi.cc", ] deps = [ ":libyuv_internal" ] public_configs = [ ":libyuv_config" ] } } if (libyuv_include_tests) { config("libyuv_unittest_warnings_config") { if (!is_win) { cflags = [ # TODO(fbarchard): Fix sign and unused variable warnings. "-Wno-sign-compare", "-Wno-unused-variable", ] } if (is_win) { cflags = [ "/wd4245", # signed/unsigned mismatch "/wd4189", # local variable is initialized but not referenced ] } } config("libyuv_unittest_config") { defines = [ "GTEST_RELATIVE_PATH" ] } test("libyuv_unittest") { testonly = true sources = [ "unit_test/basictypes_test.cc", "unit_test/color_test.cc", "unit_test/compare_test.cc", "unit_test/convert_test.cc", "unit_test/cpu_test.cc", "unit_test/cpu_thread_test.cc", "unit_test/math_test.cc", "unit_test/planar_test.cc", "unit_test/rotate_argb_test.cc", "unit_test/rotate_test.cc", "unit_test/scale_argb_test.cc", "unit_test/scale_test.cc", "unit_test/scale_uv_test.cc", "unit_test/unit_test.cc", "unit_test/unit_test.h", "unit_test/video_common_test.cc", ] deps = [ ":libyuv", "//testing/gtest", ] defines = [] if (libyuv_use_absl_flags) { defines += [ "LIBYUV_USE_ABSL_FLAGS" ] deps += [ "//third_party/abseil-cpp/absl/flags:flag", "//third_party/abseil-cpp/absl/flags:parse", ] } configs += [ ":libyuv_unittest_warnings_config" ] public_deps = [ "//testing/gtest" ] public_configs = [ 
":libyuv_unittest_config" ] if (is_linux || is_chromeos) { cflags = [ "-fexceptions" ] } if (is_ios) { configs -= [ "//build/config/compiler:default_symbols" ] configs += [ "//build/config/compiler:symbols" ] cflags = [ "-Wno-sometimes-uninitialized" ] } if (!is_ios && !libyuv_disable_jpeg) { defines += [ "HAVE_JPEG" ] } if (is_android) { deps += [ "//testing/android/native_test:native_test_native_code" ] } # TODO(YangZhang): These lines can be removed when high accuracy # YUV to RGB to Neon is ported. if ((target_cpu == "armv7" || target_cpu == "armv7s" || (target_cpu == "arm" && arm_version >= 7) || target_cpu == "arm64") && (arm_use_neon || arm_optionally_use_neon)) { defines += [ "LIBYUV_NEON" ] } defines += [ # Enable the following 3 macros to turn off assembly for specified CPU. # "LIBYUV_DISABLE_X86", # "LIBYUV_DISABLE_NEON", # Enable the following macro to build libyuv as a shared library (dll). # "LIBYUV_USING_SHARED_LIBRARY" ] } executable("compare") { sources = [ # sources "util/compare.cc", ] deps = [ ":libyuv" ] if (is_linux || is_chromeos) { cflags = [ "-fexceptions" ] } } executable("yuvconvert") { sources = [ # sources "util/yuvconvert.cc", ] deps = [ ":libyuv" ] if (is_linux || is_chromeos) { cflags = [ "-fexceptions" ] } } executable("yuvconstants") { sources = [ # sources "util/yuvconstants.c", ] deps = [ ":libyuv" ] if (is_linux || is_chromeos) { cflags = [ "-fexceptions" ] } } executable("psnr") { sources = [ # sources "util/psnr.cc", "util/psnr_main.cc", "util/ssim.cc", ] deps = [ ":libyuv" ] if (!is_ios && !libyuv_disable_jpeg) { defines = [ "HAVE_JPEG" ] } } executable("i444tonv12_eg") { sources = [ # sources "util/i444tonv12_eg.cc", ] deps = [ ":libyuv" ] } executable("cpuid") { sources = [ # sources "util/cpuid.c", ] deps = [ ":libyuv" ] } } libyuv-0.0~git20220104.b91df1a/CM_linux_packages.cmake000066400000000000000000000050651416500237200221120ustar00rootroot00000000000000# determine the version number from the #define in libyuv/version.h 
EXECUTE_PROCESS ( COMMAND grep --perl-regex --only-matching "(?<=LIBYUV_VERSION )[0-9]+" include/libyuv/version.h WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} OUTPUT_VARIABLE YUV_VERSION_NUMBER OUTPUT_STRIP_TRAILING_WHITESPACE ) SET ( YUV_VER_MAJOR 0 ) SET ( YUV_VER_MINOR 0 ) SET ( YUV_VER_PATCH ${YUV_VERSION_NUMBER} ) SET ( YUV_VERSION ${YUV_VER_MAJOR}.${YUV_VER_MINOR}.${YUV_VER_PATCH} ) MESSAGE ( "Building ver.: ${YUV_VERSION}" ) # is this a 32-bit or 64-bit build? IF ( CMAKE_SIZEOF_VOID_P EQUAL 8 ) SET ( YUV_BIT_SIZE 64 ) ELSEIF ( CMAKE_SIZEOF_VOID_P EQUAL 4 ) SET ( YUV_BIT_SIZE 32 ) ELSE () MESSAGE ( FATAL_ERROR "CMAKE_SIZEOF_VOID_P=${CMAKE_SIZEOF_VOID_P}" ) ENDIF () # detect if this is a ARM build STRING (FIND "${CMAKE_CXX_COMPILER}" "arm-linux-gnueabihf-g++" pos) IF ( ${pos} EQUAL -1 ) SET ( YUV_CROSS_COMPILE_FOR_ARM7 FALSE ) ELSE () MESSAGE ( "Cross compiling for ARM7" ) SET ( YUV_CROSS_COMPILE_FOR_ARM7 TRUE ) ENDIF () STRING (FIND "${CMAKE_SYSTEM_PROCESSOR}" "arm" pos) IF ( ${pos} EQUAL -1 ) SET ( YUV_COMPILE_FOR_ARM7 FALSE ) ELSE () MESSAGE ( "Compiling for ARM" ) SET ( YUV_COMPILE_FOR_ARM7 TRUE ) ENDIF () # setup the sytem name, such as "x86-32", "amd-64", and "arm-32 IF ( ${YUV_CROSS_COMPILE_FOR_ARM7} OR ${YUV_COMPILE_FOR_ARM7} ) SET ( YUV_SYSTEM_NAME "armhf-${YUV_BIT_SIZE}" ) ELSE () IF ( YUV_BIT_SIZE EQUAL 32 ) SET ( YUV_SYSTEM_NAME "x86-${YUV_BIT_SIZE}" ) ELSE () SET ( YUV_SYSTEM_NAME "amd-${YUV_BIT_SIZE}" ) ENDIF () ENDIF () MESSAGE ( "Packaging for: ${YUV_SYSTEM_NAME}" ) # define all the variables needed by CPack to create .deb and .rpm packages SET ( CPACK_PACKAGE_VENDOR "Frank Barchard" ) SET ( CPACK_PACKAGE_CONTACT "fbarchard@chromium.org" ) SET ( CPACK_PACKAGE_VERSION ${YUV_VERSION} ) SET ( CPACK_PACKAGE_VERSION_MAJOR ${YUV_VER_MAJOR} ) SET ( CPACK_PACKAGE_VERSION_MINOR ${YUV_VER_MINOR} ) SET ( CPACK_PACKAGE_VERSION_PATCH ${YUV_VER_PATCH} ) SET ( CPACK_RESOURCE_FILE_LICENSE ${PROJECT_SOURCE_DIR}/LICENSE ) SET ( CPACK_SYSTEM_NAME 
"linux-${YUV_SYSTEM_NAME}" ) SET ( CPACK_PACKAGE_NAME "libyuv" ) SET ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "YUV library" ) SET ( CPACK_PACKAGE_DESCRIPTION "YUV library and YUV conversion tool" ) SET ( CPACK_DEBIAN_PACKAGE_SECTION "other" ) SET ( CPACK_DEBIAN_PACKAGE_PRIORITY "optional" ) SET ( CPACK_DEBIAN_PACKAGE_MAINTAINER "Frank Barchard " ) SET ( CPACK_GENERATOR "DEB;RPM" ) # create the .deb and .rpm files (you'll need build-essential and rpm tools) INCLUDE( CPack ) libyuv-0.0~git20220104.b91df1a/CMakeLists.txt000066400000000000000000000057651416500237200203030ustar00rootroot00000000000000# CMakeLists for libyuv # Originally created for "roxlu build system" to compile libyuv on windows # Run with -DTEST=ON to build unit tests PROJECT ( YUV C CXX ) # "C" is required even for C++ projects CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) OPTION( TEST "Built unit tests" OFF ) SET ( ly_base_dir ${PROJECT_SOURCE_DIR} ) SET ( ly_src_dir ${ly_base_dir}/source ) SET ( ly_inc_dir ${ly_base_dir}/include ) SET ( ly_tst_dir ${ly_base_dir}/unit_test ) SET ( ly_lib_name yuv ) SET ( ly_lib_static ${ly_lib_name} ) SET ( ly_lib_shared ${ly_lib_name}_shared ) FILE ( GLOB_RECURSE ly_source_files ${ly_src_dir}/*.cc ) LIST ( SORT ly_source_files ) FILE ( GLOB_RECURSE ly_unittest_sources ${ly_tst_dir}/*.cc ) LIST ( SORT ly_unittest_sources ) INCLUDE_DIRECTORIES( BEFORE ${ly_inc_dir} ) # this creates the static library (.a) ADD_LIBRARY ( ${ly_lib_static} STATIC ${ly_source_files} ) # this creates the shared library (.so) ADD_LIBRARY ( ${ly_lib_shared} SHARED ${ly_source_files} ) SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES OUTPUT_NAME "${ly_lib_name}" ) SET_TARGET_PROPERTIES ( ${ly_lib_shared} PROPERTIES PREFIX "lib" ) # this creates the conversion tool ADD_EXECUTABLE ( yuvconvert ${ly_base_dir}/util/yuvconvert.cc ) TARGET_LINK_LIBRARIES ( yuvconvert ${ly_lib_static} ) INCLUDE ( FindJPEG ) if (JPEG_FOUND) include_directories( ${JPEG_INCLUDE_DIR} ) target_link_libraries( yuvconvert 
${JPEG_LIBRARY} ) add_definitions( -DHAVE_JPEG ) endif() if(TEST) find_library(GTEST_LIBRARY gtest) if(GTEST_LIBRARY STREQUAL "GTEST_LIBRARY-NOTFOUND") set(GTEST_SRC_DIR /usr/src/gtest CACHE STRING "Location of gtest sources") if(EXISTS ${GTEST_SRC_DIR}/src/gtest-all.cc) message(STATUS "building gtest from sources in ${GTEST_SRC_DIR}") set(gtest_sources ${GTEST_SRC_DIR}/src/gtest-all.cc) add_library(gtest STATIC ${gtest_sources}) include_directories(${GTEST_SRC_DIR}) include_directories(${GTEST_SRC_DIR}/include) set(GTEST_LIBRARY gtest) else() message(FATAL_ERROR "TEST is set but unable to find gtest library") endif() endif() add_executable(libyuv_unittest ${ly_unittest_sources}) target_link_libraries(libyuv_unittest ${ly_lib_name} ${GTEST_LIBRARY}) find_library(PTHREAD_LIBRARY pthread) if(NOT PTHREAD_LIBRARY STREQUAL "PTHREAD_LIBRARY-NOTFOUND") target_link_libraries(libyuv_unittest pthread) endif() if (JPEG_FOUND) target_link_libraries(libyuv_unittest ${JPEG_LIBRARY}) endif() if(NACL AND NACL_LIBC STREQUAL "newlib") target_link_libraries(libyuv_unittest glibc-compat) endif() endif() # install the conversion tool, .so, .a, and all the header files INSTALL ( PROGRAMS ${CMAKE_BINARY_DIR}/yuvconvert DESTINATION bin ) INSTALL ( TARGETS ${ly_lib_static} DESTINATION lib ) INSTALL ( TARGETS ${ly_lib_shared} LIBRARY DESTINATION lib RUNTIME DESTINATION bin ) INSTALL ( DIRECTORY ${PROJECT_SOURCE_DIR}/include/ DESTINATION include ) # create the .deb and .rpm packages using cpack INCLUDE ( CM_linux_packages.cmake ) libyuv-0.0~git20220104.b91df1a/DEPS000066400000000000000000002371561416500237200162220ustar00rootroot00000000000000gclient_gn_args_file = 'src/build/config/gclient_args.gni' gclient_gn_args = [ 'generate_location_tags', ] vars = { 'chromium_git': 'https://chromium.googlesource.com', 'chromium_revision': '829c6df33dce1085a61d8fd44209fc84bbf9a6a7', 'gn_version': 'git_revision:6f13aaac55a977e1948910942675c69f2b4f7a94', # Keep the Chromium default of generating location 
tags. 'generate_location_tags': True, } deps = { 'src/build': Var('chromium_git') + '/chromium/src/build' + '@' + 'dcea3443035f48d58193788e0bc56daca4e5db33', 'src/buildtools': Var('chromium_git') + '/chromium/src/buildtools' + '@' + '075dd7e22837a69189003e4fa84499acf63188cf', 'src/testing': Var('chromium_git') + '/chromium/src/testing' + '@' + 'f4e42be13265ec304b0f3085eee2b15f30f44077', 'src/third_party': Var('chromium_git') + '/chromium/src/third_party' + '@' + '42c249feeb71bc0cd184849f0509aefef599343d', 'src/buildtools/linux64': { 'packages': [ { 'package': 'gn/gn/linux-amd64', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', 'condition': 'checkout_linux', }, 'src/buildtools/mac': { 'packages': [ { 'package': 'gn/gn/mac-amd64', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', 'condition': 'checkout_mac', }, 'src/buildtools/win': { 'packages': [ { 'package': 'gn/gn/windows-amd64', 'version': Var('gn_version'), } ], 'dep_type': 'cipd', 'condition': 'checkout_win', }, 'src/buildtools/clang_format/script': Var('chromium_git') + '/external/github.com/llvm/llvm-project/clang/tools/clang-format.git' + '@' + '99876cacf78329e5f99c244dbe42ccd1654517a0', 'src/buildtools/third_party/libc++/trunk': Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxx.git' + '@' + '79a2e924d96e2fc1e4b937c42efd08898fa472d7', 'src/buildtools/third_party/libc++abi/trunk': Var('chromium_git') + '/external/github.com/llvm/llvm-project/libcxxabi.git' + '@' + '665b74f7d1b3bb295cd6ba7d8fcec1acd3d2ac84', 'src/buildtools/third_party/libunwind/trunk': Var('chromium_git') + '/external/github.com/llvm/llvm-project/libunwind.git' + '@' + 'f51a154281bdfe746c46c07cd4fb05be97f9441d', 'src/third_party/catapult': Var('chromium_git') + '/catapult.git' + '@' + '75423c310eb303d28978be892fcf7b9c2c824909', 'src/third_party/colorama/src': Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8', 'src/third_party/depot_tools': Var('chromium_git') + 
'/chromium/tools/depot_tools.git' + '@' + '2ffa1bde797a8127c0f72908d0bd74051fd65d0d', 'src/third_party/freetype/src': Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'cff026d41599945498044d2f4dcc0e610ffb6929', 'src/third_party/googletest/src': Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + 'e2f3978937c0244508135f126e2617a7734a68be', 'src/third_party/harfbuzz-ng/src': Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '64b29dbd5994a511acee69cb9b45ad650ef88359', 'src/third_party/libjpeg_turbo': Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '02959c3ee17abacfd1339ec22ea93301292ffd56', 'src/third_party/nasm': Var('chromium_git') + '/chromium/deps/nasm.git' + '@' + '9215e8e1d0fe474ffd3e16c1a07a0f97089e6224', 'src/tools': Var('chromium_git') + '/chromium/src/tools' + '@' + '198dc879529652b39ba6e223bcc0bcad5f1facd6', # libyuv-only dependencies (not present in Chromium). 'src/third_party/gtest-parallel': Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e', 'src/third_party/lss': { 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '92a65a8f5d705d1928874420c8d0d15bde8c89e5', 'condition': 'checkout_android or checkout_linux', }, # Android deps: 'src/third_party/accessibility_test_framework': { 'packages': [ { 'package': 'chromium/third_party/accessibility-test-framework', 'version': 'b5ec1e56e58e56bc1a0c77d43111c37f9b512c8a', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/auto/src': { 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + 'fe67d853d6356943dc79541c892ab6d3e6a7b61a', 'condition': 'checkout_android', }, 'src/third_party/boringssl/src': 'https://boringssl.googlesource.com/boringssl.git' + '@' + '3a667d10e94186fd503966f5638e134fe9fb4080', 'src/base': { 'url': Var('chromium_git') + '/chromium/src/base' + '@' + 
'e9e639622449a893a1b5e32781d072cec08ead72', 'condition': 'checkout_android', }, 'src/third_party/bazel': { 'packages': [ { 'package': 'chromium/third_party/bazel', 'version': 'VjMsf48QUWw8n7XtJP2AuSjIGmbQeYdWdwyxVvIRLmAC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/bouncycastle': { 'packages': [ { 'package': 'chromium/third_party/bouncycastle', 'version': 'c078e87552ba26e776566fdaf0f22cd8712743d0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_ndk': { 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '401019bf85744311b26c88ced255cd53401af8b7', 'condition': 'checkout_android', }, 'src/third_party/androidx': { 'packages': [ { 'package': 'chromium/third_party/androidx', 'version': '6d8ij5pzYh29WWjPbdbAWFBJSA1nUgkWf2p6wCVZKIsC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_support_test_runner': { 'packages': [ { 'package': 'chromium/third_party/android_support_test_runner', 'version': '96d4bf848cd210fdcbca6bcc8c1b4b39cbd93141', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_sdk/public': { 'packages': [ { 'package': 'chromium/third_party/android_sdk/public/build-tools/31.0.0', 'version': 'tRoD45SCi7UleQqSV7MrMQO1_e5P8ysphkCcj6z_cCQC', }, { 'package': 'chromium/third_party/android_sdk/public/emulator', 'version': 'gMHhUuoQRKfxr-MBn3fNNXZtkAVXtOwMwT7kfx8jkIgC', }, { 'package': 'chromium/third_party/android_sdk/public/extras', 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC', }, { 'package': 'chromium/third_party/android_sdk/public/patcher', 'version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C', }, { 'package': 'chromium/third_party/android_sdk/public/platform-tools', 'version': 'g7n_-r6yJd_SGRklujGB1wEt8iyr77FZTUJVS9w6O34C', }, { 'package': 'chromium/third_party/android_sdk/public/platforms/android-31', 'version': 'lL3IGexKjYlwjO_1Ga-xwxgwbE_w-lmi2Zi1uOlWUIAC', }, { 'package': 
'chromium/third_party/android_sdk/public/sources/android-31', 'version': '_a_BcnANjPYw5mSKlNHa7GFY8yc1kdqj2rmQgac7yUcC', }, { 'package': 'chromium/third_party/android_sdk/public/cmdline-tools', 'version': 'Ez2NWws2SJYCF6qw2O-mSCqK6424l3ZdSTpppLyVR_cC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/tools/clang/dsymutil': { 'packages': [ { 'package': 'chromium/llvm-build-tools/dsymutil', 'version': 'OWlhXkmj18li3yhJk59Kmjbc5KdgLh56TwCd1qBdzlIC', } ], 'condition': 'checkout_mac', 'dep_type': 'cipd', }, 'src/third_party/android_build_tools/aapt2': { 'packages': [ { 'package': 'chromium/third_party/android_build_tools/aapt2', 'version': 'version:3.6.0-alpha03-5516695-cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/byte_buddy': { 'packages': [ { 'package': 'chromium/third_party/byte_buddy', 'version': 'c9b53316603fc2d997c899c7ca1707f809b918cd', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/ced/src': { 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5', 'condition': 'checkout_android', }, 'src/third_party/errorprone/lib': { 'url': Var('chromium_git') + '/chromium/third_party/errorprone.git' + '@' + '980d49e839aa4984015efed34b0134d4b2c9b6d7', 'condition': 'checkout_android', }, 'src/third_party/findbugs': { 'url': Var('chromium_git') + '/chromium/deps/findbugs.git' + '@' + '4275d9ac8610db6b1bc9a5e887f97e41b33fac67', 'condition': 'checkout_android', }, 'src/third_party/gson': { 'packages': [ { 'package': 'chromium/third_party/gson', 'version': '681931c9778045903a0ed59856ce2dd8dd7bf7ca', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/guava': { 'packages': [ { 'package': 'chromium/third_party/guava', 'version': 'a6fba501f3a0de88b9be1daa2052632de5b96a46', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/hamcrest': { 'packages': [ { 'package': 
'chromium/third_party/hamcrest', 'version': '37eccfc658fe79695d6abb6dd497463c4372032f', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/icu': { 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'bf66d373ae781a3498f2babe7b61d933dd774b82', }, 'src/third_party/icu4j': { 'packages': [ { 'package': 'chromium/third_party/icu4j', 'version': 'e87e5bed2b4935913ee26a3ebd0b723ee2344354', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/intellij': { 'packages': [ { 'package': 'chromium/third_party/intellij', 'version': '77c2721b024b36ee073402c08e6d8428c0295336', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/jdk': { 'packages': [ { 'package': 'chromium/third_party/jdk', 'version': 'PfRSnxe8Od6WU4zBXomq-zsgcJgWmm3z4gMQNB-r2QcC', }, { 'package': 'chromium/third_party/jdk/extras', 'version': 'fkhuOQ3r-zKtWEdKplpo6k0vKkjl-LY_rJTmtzFCQN4C', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/jsr-305/src': { 'url': Var('chromium_git') + '/external/jsr-305.git' + '@' + '642c508235471f7220af6d5df2d3210e3bfc0919', 'condition': 'checkout_android', }, 'src/third_party/junit/src': { 'url': Var('chromium_git') + '/external/junit.git' + '@' + '64155f8a9babcfcf4263cf4d08253a1556e75481', 'condition': 'checkout_android', }, 'src/third_party/libunwindstack': { 'url': Var('chromium_git') + '/chromium/src/third_party/libunwindstack.git' + '@' + '6868358481bb1e5e20d155c1084dc436c88b5e6b', 'condition': 'checkout_android', }, 'src/third_party/mockito/src': { 'url': Var('chromium_git') + '/external/mockito/mockito.git' + '@' + '04a2a289a4222f80ad20717c25144981210d2eac', 'condition': 'checkout_android', }, 'src/third_party/objenesis': { 'packages': [ { 'package': 'chromium/third_party/objenesis', 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/ow2_asm': { 'packages': [ { 'package': 
'chromium/third_party/ow2_asm', 'version': 'NNAhdJzMdnutUVqfSJm5v0tVazA9l3Dd6CRwH6N4Q5kC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/r8': { 'packages': [ { 'package': 'chromium/third_party/r8', 'version': 'Nu_mvQJe34CotIXadFlA3w732CJ9EvQGuVs4udcZedAC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/proguard': { 'packages': [ { 'package': 'chromium/third_party/proguard', 'version': 'Fd91BJFVlmiO6c46YMTsdy7n2f5Sk2hVVGlzPLvqZPsC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/requests/src': { 'url': Var('chromium_git') + '/external/github.com/kennethreitz/requests.git' + '@' + 'refs/tags/v2.23.0', 'condition': 'checkout_android', }, 'src/third_party/robolectric': { 'packages': [ { 'package': 'chromium/third_party/robolectric', 'version': 'iC6RDM5EH3GEAzR-1shW_Mg0FeeNE5shq1okkFfuuNQC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/sqlite4java': { 'packages': [ { 'package': 'chromium/third_party/sqlite4java', 'version': '889660698187baa7c8b0d79f7bf58563125fbd66', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/turbine': { 'packages': [ { 'package': 'chromium/third_party/turbine', 'version': 'Om6yIEXgJxuqghErK29h9RcMH6VaymMbxwScwXmcN6EC', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/ub-uiautomator/lib': { 'url': Var('chromium_git') + '/chromium/third_party/ub-uiautomator.git' + '@' + '00270549ce3161ae72ceb24712618ea28b4f9434', 'condition': 'checkout_android', }, # iOS deps: 'src/ios': { 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '81826d980c159f949c2c7901f4dbec9a09788964', 'condition': 'checkout_ios' }, # Everything coming after this is automatically updated by the auto-roller. 
# === ANDROID_DEPS Generated Code Start === 'src/third_party/android_deps/libs/android_arch_core_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_core_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_lifecycle_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_lifecycle_runtime': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/backport_util_concurrent_backport_util_concurrent', 'version': 'version:2@3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/classworlds_classworlds': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/classworlds_classworlds', 'version': 'version:2@1.1-alpha-2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_appcompat_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_cardview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_collections': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_collections', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_cursoradapter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_customview': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_design': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_design', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_documentfile': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_drawerlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_interpolator': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator', 'version': 
'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_loader': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_multidex': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_multidex', 'version': 'version:2@1.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_print': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_print', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_recyclerview_v7': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_android_support_support_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_core_ui': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_core_utils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_fragment': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_media_compat': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_v4': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_support_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_transition': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_versionedparcelable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_support_viewpager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager', 'version': 'version:2@28.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_tools_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_common', 'version': 'version:2@30.0.0-alpha10.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_desugar_jdk_libs_configuration', 'version': 'version:2@1.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_layoutlib_layoutlib_api', 'version': 'version:2@30.0.0-alpha10.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_android_tools_sdk_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_tools_sdk_common', 'version': 'version:2@30.0.0-alpha10.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_github_ben_manes_caffeine_caffeine', 'version': 'version:2@2.8.8.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_github_kevinstern_software_and_algorithms', 'version': 'version:2@1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_datatransport_transport_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_datatransport_transport_api', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone', 'version': 'version:2@17.5.0.cr0', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_base': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base', 'version': 'version:2@17.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_basement': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement', 'version': 'version:2@17.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cloud_messaging', 'version': 'version:2@16.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_fido': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido', 'version': 'version:2@19.0.0-beta.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_gcm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_location': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_tasks': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks', 'version': 'version:2@17.2.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision', 'version': 'version:2@18.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common', 'version': 'version:2@18.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_material_material': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_material_material', 'version': 
'version:2@1.6.0-alpha01.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_android_play_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core', 'version': 'version:2@1.10.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_auto_auto_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_auto_common', 'version': 'version:2@1.1.2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_auto_service_auto_service': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service', 'version': 'version:2@1.0-rc6.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_service_auto_service_annotations', 'version': 'version:2@1.0-rc6.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_auto_value_auto_value_annotations', 'version': 'version:2@1.7.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_code_findbugs_jformatstring': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jformatstring', 'version': 'version:2@3.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305', 'version': 'version:2@3.0.2.cr0', }, ], 'condition': 
'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_code_gson_gson': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_code_gson_gson', 'version': 'version:2@2.8.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_dagger_dagger': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger', 'version': 'version:2@2.30.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler', 'version': 'version:2@2.30.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers', 'version': 'version:2@2.30.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi', 'version': 'version:2@2.30.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotation', 'version': 'version:2@2.10.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations', 'version': 'version:2@2.10.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_check_api', 'version': 'version:2@2.10.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_core': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_core', 'version': 'version:2@2.10.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_type_annotations', 'version': 'version:2@2.10.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_errorprone_javac': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac', 'version': 'version:2@9+181-r4173-1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded', 'version': 'version:2@9-dev-r4023-3.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_annotations', 'version': 'version:2@16.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_common', 'version': 'version:2@19.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_google_firebase_firebase_components': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_components', 'version': 'version:2@16.1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders', 'version': 'version:2@16.1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_encoders_json', 'version': 'version:2@17.1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid', 'version': 'version:2@21.0.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_iid_interop', 'version': 'version:2@17.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations', 'version': 'version:2@16.3.5.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_installations_interop', 'version': 'version:2@16.0.1.cr0', }, ], 'condition': 'checkout_android', 
'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_measurement_connector', 'version': 'version:2@18.0.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_firebase_firebase_messaging': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_firebase_firebase_messaging', 'version': 'version:2@21.0.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_flatbuffers_flatbuffers_java', 'version': 'version:2@2.0.3.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format', 'version': 'version:2@1.5.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_guava_failureaccess': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_failureaccess', 'version': 'version:2@1.0.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_guava_guava': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava', 'version': 'version:2@31.0-jre.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_guava_guava_android': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava_android', 'version': 'version:2@31.0-android.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/com_google_guava_listenablefuture': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_guava_listenablefuture', 'version': 'version:2@1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations', 'version': 'version:2@1.3.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_java': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_java', 'version': 'version:2@3.4.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_javalite', 'version': 'version:2@3.13.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_googlecode_java_diff_utils_diffutils', 'version': 'version:2@1.3.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_squareup_javapoet': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javapoet', 'version': 'version:2@1.13.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/com_squareup_javawriter': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_squareup_javawriter', 'version': 'version:2@2.1.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/io_github_java_diff_utils_java_diff_utils', 'version': 'version:2@4.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/javax_annotation_javax_annotation_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/javax_annotation_javax_annotation_api', 'version': 'version:2@1.3.2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', 'version': 'version:2@1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/javax_inject_javax_inject': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', 'version': 'version:2@1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/nekohtml_nekohtml': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/nekohtml_nekohtml', 'version': 'version:2@1.9.6.2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/nekohtml_xercesminimal': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/nekohtml_xercesminimal', 'version': 'version:2@1.9.6.2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/net_ltgt_gradle_incap_incap': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/net_ltgt_gradle_incap_incap', 'version': 'version:2@0.2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2', 'version': 'version:2@2.3.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 
'src/third_party/android_deps/libs/org_apache_ant_ant': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant', 'version': 'version:2@1.8.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_ant_ant_launcher': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_ant_ant_launcher', 'version': 'version:2@1.8.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_ant_tasks', 'version': 'version:2@2.1.3.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_artifact_manager', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_error_diagnostics', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_model': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_model', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/org_apache_maven_maven_plugin_registry', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_profile': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_profile', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_project': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_project', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_repository_metadata', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_maven_settings': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_maven_settings', 'version': 'version:2@2.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_file', 'version': 'version:2@1.0-beta-6.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_lightweight', 'version': 'version:2@1.0-beta-6.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_http_shared', 'version': 'version:2@1.0-beta-6.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_apache_maven_wagon_wagon_provider_api', 'version': 'version:2@1.0-beta-6.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ccil_cowan_tagsoup_tagsoup', 'version': 'version:2@1.2.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', 'version': 'version:2@2.5.5.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_checkerframework_checker_qual': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_qual', 'version': 'version:2@3.12.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_dataflow_errorprone', 'version': 'version:2@3.15.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', 'version': 'version:2@1.17.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_container_default', 'version': 'version:2@1.0-alpha-9-stable-1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_interpolation', 'version': 'version:2@1.11.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_codehaus_plexus_plexus_utils', 'version': 'version:2@1.5.15.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_eclipse_jgit_org_eclipse_jgit', 'version': 'version:2@4.4.1.201607150455-r.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_annotations', 'version': 'version:2@13.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib', 'version': 'version:2@1.6.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_common', 'version': 'version:2@1.6.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7': { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk7', 'version': 'version:2@1.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlin_kotlin_stdlib_jdk8', 'version': 'version:2@1.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_android', 'version': 'version:2@1.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_coroutines_core_jvm', 'version': 'version:2@1.5.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_jetbrains_kotlinx_kotlinx_metadata_jvm', 'version': 'version:2@0.1.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_ow2_asm_asm': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm', 'version': 'version:2@7.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_ow2_asm_asm_analysis': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_analysis', 'version': 'version:2@7.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_ow2_asm_asm_commons': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_commons', 
'version': 'version:2@7.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_ow2_asm_asm_tree': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_tree', 'version': 'version:2@7.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_ow2_asm_asm_util': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_ow2_asm_asm_util', 'version': 'version:2@7.0.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_pcollections_pcollections': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_pcollections_pcollections', 'version': 'version:2@2.1.2.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_annotations': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_annotations', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_junit': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_junit', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_pluginapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_pluginapi', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_plugins_maven_dependency_resolver', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_resources': { 'packages': [ { 
'package': 'chromium/third_party/android_deps/libs/org_robolectric_resources', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_robolectric': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_robolectric', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_sandbox': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_sandbox', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_shadowapi': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadowapi', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_shadows_framework': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_framework', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_shadows_playservices': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_shadows_playservices', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_utils': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_deps/libs/org_robolectric_utils_reflector': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/org_robolectric_utils_reflector', 'version': 'version:2@4.3.1.cr0', }, ], 'condition': 'checkout_android', 'dep_type': 
'cipd', }, # === ANDROID_DEPS Generated Code End === } pre_deps_hooks = [ { # Remove any symlinks from before 177567c518b121731e507e9b9c4049c4dc96e4c8. # TODO(kjellander): Remove this in March 2017. 'name': 'cleanup_links', 'pattern': '.', 'action': ['python3', 'src/cleanup_links.py'], }, ] hooks = [ { # This clobbers when necessary (based on get_landmines.py). It should be # an early hook but it will need to be run after syncing Chromium and # setting up the links, so the script actually exists. 'name': 'landmines', 'pattern': '.', 'action': [ 'python3', 'src/build/landmines.py', '--landmine-scripts', 'src/tools_libyuv/get_landmines.py', '--src-dir', 'src', ], }, # Downloads the current stable linux sysroot to build/linux/ if needed. { 'name': 'sysroot_arm', 'pattern': '.', 'condition': 'checkout_linux and checkout_arm', 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py', '--arch=arm'], }, { 'name': 'sysroot_arm64', 'pattern': '.', 'condition': 'checkout_linux and checkout_arm64', 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py', '--arch=arm64'], }, { 'name': 'sysroot_x86', 'pattern': '.', 'condition': 'checkout_linux and (checkout_x86 or checkout_x64)', 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py', '--arch=x86'], }, { 'name': 'sysroot_mips', 'pattern': '.', 'condition': 'checkout_linux and checkout_mips', 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py', '--arch=mips'], }, { 'name': 'sysroot_x64', 'pattern': '.', 'condition': 'checkout_linux and checkout_x64', 'action': ['python3', 'src/build/linux/sysroot_scripts/install-sysroot.py', '--arch=x64'], }, { # Update the Windows toolchain if necessary. 'name': 'win_toolchain', 'pattern': '.', 'action': ['python3', 'src/build/vs_toolchain.py', 'update'], }, { # Update the Mac toolchain if necessary. 
'name': 'mac_toolchain', 'pattern': '.', 'action': ['python3', 'src/build/mac_toolchain.py'], 'condition': 'checkout_mac', }, { 'name': 'msan_chained_origins', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python', 'src/third_party/depot_tools/download_from_google_storage.py', "--no_resume", "--no_auth", "--bucket", "chromium-instrumented-libraries", "-s", "src/third_party/instrumented_libraries/binaries/msan-chained-origins.tgz.sha1", ], }, { 'name': 'msan_no_origins', 'pattern': '.', 'condition': 'checkout_instrumented_libraries', 'action': [ 'python', 'src/third_party/depot_tools/download_from_google_storage.py', "--no_resume", "--no_auth", "--bucket", "chromium-instrumented-libraries", "-s", "src/third_party/instrumented_libraries/binaries/msan-no-origins.tgz.sha1", ], }, { # Pull clang if needed or requested via GYP_DEFINES. # Note: On Win, this should run after win_toolchain, as it may use it. 'name': 'clang', 'pattern': '.', 'action': ['python3', 'src/tools/clang/scripts/update.py'], }, { # Update LASTCHANGE. 'name': 'lastchange', 'pattern': '.', 'action': ['python3', 'src/build/util/lastchange.py', '-o', 'src/build/util/LASTCHANGE'], }, # Pull clang-format binaries using checked-in hashes. 
{ 'name': 'clang_format_win', 'pattern': '.', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=win32', '--no_auth', '--bucket', 'chromium-clang-format', '-s', 'src/buildtools/win/clang-format.exe.sha1', ], }, { 'name': 'clang_format_mac', 'pattern': '.', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=darwin', '--no_auth', '--bucket', 'chromium-clang-format', '-s', 'src/buildtools/mac/clang-format.sha1', ], }, { 'name': 'clang_format_linux', 'pattern': '.', 'condition': 'host_os == "linux"', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=linux*', '--no_auth', '--bucket', 'chromium-clang-format', '-s', 'src/buildtools/linux64/clang-format.sha1', ], }, # Pull luci-go binaries (isolate, swarming) using checked-in hashes. { 'name': 'luci-go_win', 'pattern': '.', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=win32', '--no_auth', '--bucket', 'chromium-luci', '-d', 'src/tools/luci-go/win64', ], }, { 'name': 'luci-go_mac', 'pattern': '.', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=darwin', '--no_auth', '--bucket', 'chromium-luci', '-d', 'src/tools/luci-go/mac64', ], }, { 'name': 'luci-go_linux', 'pattern': '.', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=linux*', '--no_auth', '--bucket', 'chromium-luci', '-d', 'src/tools/luci-go/linux64', ], }, { # We used to use src as a CIPD root. We moved it to a different directory # in crrev.com/c/930178 but left the clobber here to ensure that that CL # could be reverted safely. This can be safely removed once crbug.com/794764 # is resolved. 
'name': 'Android Clobber Deprecated CIPD Root', 'pattern': '.', 'condition': 'checkout_android', 'action': ['src/build/cipd/clobber_cipd_root.py', '--root', 'src', ], }, { 'name': 'Generate component metadata for tests', 'pattern': '.', 'action': [ 'vpython3', 'src/testing/generate_location_tags.py', '--out', 'src/testing/location_tags.json', ], }, ] recursedeps = [] libyuv-0.0~git20220104.b91df1a/DIR_METADATA000066400000000000000000000000641416500237200173270ustar00rootroot00000000000000monorail { component: "Internals>Images>Codecs" } libyuv-0.0~git20220104.b91df1a/LICENSE000066400000000000000000000027421416500237200165400ustar00rootroot00000000000000Copyright 2011 The LibYuv Project Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. libyuv-0.0~git20220104.b91df1a/OWNERS000066400000000000000000000003351416500237200164670ustar00rootroot00000000000000mbonadei@chromium.org fbarchard@chromium.org magjed@chromium.org pbos@chromium.org per-file *.gn=mbonadei@chromium.org per-file .gitignore=* per-file AUTHORS=* per-file DEPS=* per-file PRESUBMIT.py=mbonadei@chromium.org libyuv-0.0~git20220104.b91df1a/PATENTS000066400000000000000000000025531416500237200165740ustar00rootroot00000000000000Additional IP Rights Grant (Patents) "This implementation" means the copyrightable works distributed by Google as part of the LibYuv code package. Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, transfer, and otherwise run, modify and propagate the contents of this implementation of the LibYuv code package, where such license applies only to those patent claims, both currently owned by Google and acquired in the future, licensable by Google that are necessarily infringed by this implementation of the LibYuv code package. This grant does not include claims that would be infringed only as a consequence of further modification of this implementation. 
If you or your agent or exclusive licensee institute or order or agree to the institution of patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that this implementation of the LibYuv code package or any code incorporated within this implementation of the LibYuv code package constitutes direct or contributory patent infringement, or inducement of patent infringement, then any patent rights granted to you under this License for this implementation of the LibYuv code package shall terminate as of the date such litigation is filed.libyuv-0.0~git20220104.b91df1a/PRESUBMIT.py000066400000000000000000000037331416500237200174600ustar00rootroot00000000000000# Copyright 2017 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. def _CommonChecks(input_api, output_api): """Checks common to both upload and commit.""" results = [] results.extend(input_api.canned_checks.RunPylint(input_api, output_api, files_to_skip=(r'^base[\\\/].*\.py$', r'^build[\\\/].*\.py$', r'^buildtools[\\\/].*\.py$', r'^ios[\\\/].*\.py$', r'^out.*[\\\/].*\.py$', r'^testing[\\\/].*\.py$', r'^third_party[\\\/].*\.py$', r'^tools[\\\/].*\.py$', # TODO(kjellander): should arguably be checked. 
r'^tools_libyuv[\\\/]valgrind[\\\/].*\.py$', r'^xcodebuild.*[\\\/].*\.py$',), disabled_warnings=['F0401', # Failed to import x 'E0611', # No package y in x 'W0232', # Class has no __init__ method ], pylintrc='pylintrc')) return results def CheckChangeOnUpload(input_api, output_api): results = [] results.extend(_CommonChecks(input_api, output_api)) results.extend( input_api.canned_checks.CheckGNFormatted(input_api, output_api)) return results def CheckChangeOnCommit(input_api, output_api): results = [] results.extend(_CommonChecks(input_api, output_api)) results.extend(input_api.canned_checks.CheckOwners(input_api, output_api)) results.extend(input_api.canned_checks.CheckChangeWasUploaded( input_api, output_api)) results.extend(input_api.canned_checks.CheckChangeHasDescription( input_api, output_api)) return results libyuv-0.0~git20220104.b91df1a/README.chromium000066400000000000000000000003121416500237200202240ustar00rootroot00000000000000Name: libyuv URL: http://code.google.com/p/libyuv/ Version: 1807 License: BSD License File: LICENSE Description: libyuv is an open source project that includes YUV conversion and scaling functionality.libyuv-0.0~git20220104.b91df1a/README.md000066400000000000000000000012271416500237200170070ustar00rootroot00000000000000**libyuv** is an open source project that includes YUV scaling and conversion functionality. * Scale YUV to prepare content for compression, with point, bilinear or box filter. * Convert to YUV from webcam formats for compression. * Convert to RGB formats for rendering/effects. * Rotate by 90/180/270 degrees to adjust for mobile devices in portrait mode. * Optimized for SSSE3/AVX2 on x86/x64. * Optimized for Neon on Arm. * Optimized for MSA on Mips. ### Development See [Getting started][1] for instructions on how to get started developing. You can also browse the [docs directory][2] for more documentation. 
[1]: ./docs/getting_started.md [2]: ./docs/ libyuv-0.0~git20220104.b91df1a/build_overrides/000077500000000000000000000000001416500237200207075ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/build_overrides/build.gni000066400000000000000000000050411416500237200225050ustar00rootroot00000000000000# Copyright 2016 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. # Variable that can be used to support multiple build scenarios, like having # Chromium specific targets in a client project's GN file etc. build_with_chromium = false # Some non-Chromium builds don't support building java targets. enable_java_templates = true # Allow using custom suppressions files (currently not used by libyuv). asan_suppressions_file = "//build/sanitizers/asan_suppressions.cc" lsan_suppressions_file = "//build/sanitizers/lsan_suppressions.cc" tsan_suppressions_file = "//build/sanitizers/tsan_suppressions.cc" msan_blacklist_path = rebase_path("//tools_libyuv/msan/blacklist.txt", root_build_dir) ubsan_blacklist_path = rebase_path("//tools_libyuv/ubsan/blacklist.txt", root_build_dir) ubsan_vptr_blacklist_path = rebase_path("//tools_libyuv/ubsan/vptr_blacklist.txt", root_build_dir) # For Chromium, Android 32-bit non-component, non-clang builds hit a 4GiB size # limit, making them requiring symbol_level=2. WebRTC doesn't hit that problem # so we just ignore that assert. See https://crbug.com/648948 for more info. ignore_elf32_limitations = true # Use bundled hermetic Xcode installation maintained by Chromium, # except for local iOS builds where it is unsupported. 
if (host_os == "mac") { _result = exec_script("//build/mac/should_use_hermetic_xcode.py", [ target_os ], "value") assert(_result != 2, "Do not allow building targets with the default" + "hermetic toolchain if the minimum OS version is not met.") use_system_xcode = _result == 0 } declare_args() { # Tracing support requires //third_party/perfetto. enable_base_tracing = false use_perfetto_client_library = false # Limits the defined //third_party/android_deps targets to only "buildCompile" # and "buildCompileNoDeps" targets. This is useful for third-party # repositories which do not use JUnit tests. For instance, # limit_android_deps == true removes "gn gen" requirement for # //third_party/robolectric . limit_android_deps = false # Allows googletest to pretty-print various absl types. # Defined here rather than in gtest.gni to match chromium. gtest_enable_absl_printers = true } libyuv-0.0~git20220104.b91df1a/build_overrides/gtest.gni000066400000000000000000000014321416500237200225340ustar00rootroot00000000000000# Copyright (c) 2016 The LibYuv project authors. All Rights Reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. # Include support for registering main function in multi-process tests. gtest_include_multiprocess = true # Include support for platform-specific operations across unit tests. gtest_include_platform_test = true # Exclude support for testing Objective C code on OS X and iOS. gtest_include_objc_support = true # Exclude support for flushing coverage files on iOS. 
gtest_include_ios_coverage = true libyuv-0.0~git20220104.b91df1a/cleanup_links.py000077500000000000000000000075331416500237200207420ustar00rootroot00000000000000#!/usr/bin/env python # Copyright 2017 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. # This is a copy of the file from WebRTC in: # https://chromium.googlesource.com/external/webrtc/+/master/cleanup_links.py """Script to cleanup symlinks created from setup_links.py. Before 177567c518b121731e507e9b9c4049c4dc96e4c8 (#15754) we had a Chromium checkout which we created symlinks into. In order to do clean syncs after landing that change, this script cleans up any old symlinks, avoiding annoying manual cleanup needed in order to complete gclient sync. """ import logging import optparse import os import shelve import subprocess import sys ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) LINKS_DB = 'links' # Version management to make future upgrades/downgrades easier to support. SCHEMA_VERSION = 1 class WebRTCLinkSetup(object): def __init__(self, links_db, dry_run=False): self._dry_run = dry_run self._links_db = links_db def CleanupLinks(self): logging.debug('CleanupLinks') for source, link_path in self._links_db.iteritems(): if source == 'SCHEMA_VERSION': continue if os.path.islink(link_path) or sys.platform.startswith('win'): # os.path.islink() always returns false on Windows # See http://bugs.python.org/issue13143. 
logging.debug('Removing link to %s at %s', source, link_path) if not self._dry_run: if os.path.exists(link_path): if sys.platform.startswith('win') and os.path.isdir(link_path): subprocess.check_call(['rmdir', '/q', '/s', link_path], shell=True) else: os.remove(link_path) del self._links_db[source] def _initialize_database(filename): links_database = shelve.open(filename) # Wipe the database if this version of the script ends up looking at a # newer (future) version of the links db, just to be sure. version = links_database.get('SCHEMA_VERSION') if version and version != SCHEMA_VERSION: logging.info('Found database with schema version %s while this script only ' 'supports %s. Wiping previous database contents.', version, SCHEMA_VERSION) links_database.clear() links_database['SCHEMA_VERSION'] = SCHEMA_VERSION return links_database def main(): parser = optparse.OptionParser() parser.add_option('-d', '--dry-run', action='store_true', default=False, help='Print what would be done, but don\'t perform any ' 'operations. This will automatically set logging to ' 'verbose.') parser.add_option('-v', '--verbose', action='store_const', const=logging.DEBUG, default=logging.INFO, help='Print verbose output for debugging.') options, _ = parser.parse_args() if options.dry_run: options.verbose = logging.DEBUG logging.basicConfig(format='%(message)s', level=options.verbose) # Work from the root directory of the checkout. script_dir = os.path.dirname(os.path.abspath(__file__)) os.chdir(script_dir) # The database file gets .db appended on some platforms. 
db_filenames = [LINKS_DB, LINKS_DB + '.db'] if any(os.path.isfile(f) for f in db_filenames): links_database = _initialize_database(LINKS_DB) try: symlink_creator = WebRTCLinkSetup(links_database, options.dry_run) symlink_creator.CleanupLinks() finally: for f in db_filenames: if os.path.isfile(f): os.remove(f) return 0 if __name__ == '__main__': sys.exit(main()) libyuv-0.0~git20220104.b91df1a/codereview.settings000066400000000000000000000003221416500237200214410ustar00rootroot00000000000000# This file is used by `git cl` to get repository specific information. CODE_REVIEW_SERVER: codereview.chromium.org GERRIT_HOST: True PROJECT: libyuv VIEW_VC: https://chromium.googlesource.com/libyuv/libyuv/+/ libyuv-0.0~git20220104.b91df1a/docs/000077500000000000000000000000001416500237200164565ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/docs/deprecated_builds.md000066400000000000000000000314411416500237200224450ustar00rootroot00000000000000# Deprecated Builds Older documentation on build configs which are no longer supported. ## Pre-requisites You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools Refer to chromium instructions for each platform for other prerequisites. 
## Getting the Code Create a working directory, enter it, and run: gclient config https://chromium.googlesource.com/libyuv/libyuv gclient sync Then you'll get a .gclient file like: solutions = [ { "name" : "libyuv", "url" : "https://chromium.googlesource.com/libyuv/libyuv", "deps_file" : "DEPS", "managed" : True, "custom_deps" : { }, "safesync_url": "", }, ]; For iOS add `;target_os=['ios'];` to your OSX .gclient and run `GYP_DEFINES="OS=ios" gclient sync`. Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master ### Android For Android add `;target_os=['android'];` to your Linux .gclient solutions = [ { "name" : "libyuv", "url" : "https://chromium.googlesource.com/libyuv/libyuv", "deps_file" : "DEPS", "managed" : True, "custom_deps" : { }, "safesync_url": "", }, ]; target_os = ["android", "unix"]; Then run: export GYP_DEFINES="OS=android" gclient sync Caveat: There's an error with Google Play services updates. If you get the error "Your version of the Google Play services library is not up to date", run the following: cd chromium/src ./build/android/play_services/update.py download cd ../.. For Windows the gclient sync must be done from an Administrator command prompt. The sync will generate native build files for your environment using gyp (Windows: Visual Studio, OSX: XCode, Linux: make).
This generation can also be forced manually: `gclient runhooks` To get just the source (not buildable): git clone https://chromium.googlesource.com/libyuv/libyuv ## Building the Library and Unittests ### Windows set GYP_DEFINES=target_arch=ia32 call python gyp_libyuv -fninja -G msvs_version=2013 ninja -j7 -C out\Release ninja -j7 -C out\Debug set GYP_DEFINES=target_arch=x64 call python gyp_libyuv -fninja -G msvs_version=2013 ninja -C out\Debug_x64 ninja -C out\Release_x64 #### Building with clangcl set GYP_DEFINES=clang=1 target_arch=ia32 call python tools\clang\scripts\update.py call python gyp_libyuv -fninja libyuv_test.gyp ninja -C out\Debug ninja -C out\Release ### OSX Clang 64 bit shown. Remove `clang=1` for GCC and change x64 to ia32 for 32 bit. GYP_DEFINES="clang=1 target_arch=x64" ./gyp_libyuv ninja -j7 -C out/Debug ninja -j7 -C out/Release GYP_DEFINES="clang=1 target_arch=ia32" ./gyp_libyuv ninja -j7 -C out/Debug ninja -j7 -C out/Release ### iOS http://www.chromium.org/developers/how-tos/build-instructions-ios Add to .gclient last line: `target_os=['ios'];` armv7 GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest arm64 GYP_DEFINES="OS=ios target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest both armv7 and arm64 (fat) GYP_DEFINES="OS=ios target_arch=armv7 target_subarch=both" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_ios" ./gyp_libyuv ninja -j7 -C out_ios/Debug-iphoneos libyuv_unittest ninja -j7 -C out_ios/Release-iphoneos libyuv_unittest simulator GYP_DEFINES="OS=ios target_arch=ia32 target_subarch=arm32" GYP_CROSSCOMPILE=1 GYP_GENERATOR_FLAGS="output_dir=out_sim" ./gyp_libyuv ninja -j7 -C 
out_sim/Debug-iphonesimulator libyuv_unittest ninja -j7 -C out_sim/Release-iphonesimulator libyuv_unittest ### Android https://code.google.com/p/chromium/wiki/AndroidBuildInstructions Add to .gclient last line: `target_os=['android'];` armv7 GYP_DEFINES="OS=android" GYP_CROSSCOMPILE=1 ./gyp_libyuv ninja -j7 -C out/Debug yuv_unittest_apk ninja -j7 -C out/Release yuv_unittest_apk arm64 GYP_DEFINES="OS=android target_arch=arm64 target_subarch=arm64" GYP_CROSSCOMPILE=1 ./gyp_libyuv ninja -j7 -C out/Debug yuv_unittest_apk ninja -j7 -C out/Release yuv_unittest_apk ia32 GYP_DEFINES="OS=android target_arch=ia32" GYP_CROSSCOMPILE=1 ./gyp_libyuv ninja -j7 -C out/Debug yuv_unittest_apk ninja -j7 -C out/Release yuv_unittest_apk GYP_DEFINES="OS=android target_arch=ia32 android_full_debug=1" GYP_CROSSCOMPILE=1 ./gyp_libyuv ninja -j7 -C out/Debug yuv_unittest_apk mipsel GYP_DEFINES="OS=android target_arch=mipsel" GYP_CROSSCOMPILE=1 ./gyp_libyuv ninja -j7 -C out/Debug yuv_unittest_apk ninja -j7 -C out/Release yuv_unittest_apk arm32 disassembly: third_party/android_ndk/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-objdump -d out/Release/obj/source/libyuv.row_neon.o arm64 disassembly: third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d out/Release/obj/source/libyuv.row_neon64.o Running tests: build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* Running test as benchmark: build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1" Running test with C code: build/android/test_runner.py gtest -s libyuv_unittest -t 7200 --verbose --release --gtest_filter=* -a "--libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1" #### Building with GN gn gen out/Release 
"--args=is_debug=false target_cpu=\"x86\"" gn gen out/Debug "--args=is_debug=true target_cpu=\"x86\"" ninja -C out/Release ninja -C out/Debug ### Building Offical with GN gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true" ninja -C out/Official #### Building mips with GN mipsel gn gen out/Default "--args=is_debug=false target_cpu=\"mipsel\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false" ninja -C out/Default mips64el gn gen out/Default "--args=is_debug=false target_cpu=\"mips64el\" target_os = \"android\" mips_arch_variant = \"r6\" mips_use_msa = true is_component_build = true is_clang = false" ninja -C out/Default ### Linux GYP_DEFINES="target_arch=x64" ./gyp_libyuv ninja -j7 -C out/Debug ninja -j7 -C out/Release GYP_DEFINES="target_arch=ia32" ./gyp_libyuv ninja -j7 -C out/Debug ninja -j7 -C out/Release #### CentOS On CentOS 32 bit the following work around allows a sync: export GYP_DEFINES="host_arch=ia32" gclient sync ### Windows Shared Library Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. gclient runhooks After this command follow the building the library instructions above. If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows ### Build targets ninja -C out/Debug libyuv ninja -C out/Debug libyuv_unittest ninja -C out/Debug compare ninja -C out/Debug yuvconvert ninja -C out/Debug yuvconstants ninja -C out/Debug psnr ninja -C out/Debug cpuid ## Building the Library with make ### Linux make -j7 V=1 -f linux.mk make -j7 V=1 -f linux.mk clean make -j7 V=1 -f linux.mk CXX=clang++ ## Building the Library with cmake Install cmake: http://www.cmake.org/ Default debug build: mkdir out cd out cmake .. cmake --build . 
Release build/install mkdir out cd out cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" .. cmake --build . --config Release sudo cmake --build . --target install --config Release ### Windows 8 Phone Pre-requisite: * Install Visual Studio 2012 and Arm to your environment.
Then: call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat" or with Visual Studio 2013: call "c:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\x86_arm\vcvarsx86_arm.bat" nmake /f winarm.mk clean nmake /f winarm.mk ### Windows Shared Library Modify libyuv.gyp from 'static_library' to 'shared_library', and add 'LIBYUV_BUILDING_SHARED_LIBRARY' to 'defines'. Then run this. gclient runhooks After this command follow the building the library instructions above. If you get a compile error for atlthunk.lib on Windows, read http://www.chromium.org/developers/how-tos/build-instructions-windows ### 64 bit Windows set GYP_DEFINES=target_arch=x64 gclient runhooks V=1 ### ARM Linux export GYP_DEFINES="target_arch=arm" export CROSSTOOL=``/arm-none-linux-gnueabi export CXX=$CROSSTOOL-g++ export CC=$CROSSTOOL-gcc export AR=$CROSSTOOL-ar export AS=$CROSSTOOL-as export RANLIB=$CROSSTOOL-ranlib gclient runhooks ## Running Unittests ### Windows out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*" ### OSX out/Release/libyuv_unittest --gtest_filter="*" ### Linux out/Release/libyuv_unittest --gtest_filter="*" Replace --gtest_filter="*" with specific unittest to run. May include wildcards. e.g. 
out/Release/libyuv_unittest --gtest_filter=libyuvTest.I420ToARGB_Opt ## CPU Emulator tools ### Intel SDE (Software Development Emulator) Pre-requisite: Install IntelSDE for Windows: http://software.intel.com/en-us/articles/intel-software-development-emulator Then run: c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=* ## Memory tools ### Running Dr Memory memcheck for Windows Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html set GYP_DEFINES=build_for_tool=drmemory target_arch=ia32 call python gyp_libyuv -fninja -G msvs_version=2013 ninja -C out\Debug drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=* ### Running UBSan See Chromium instructions for sanitizers: https://www.chromium.org/developers/testing/undefinedbehaviorsanitizer Sanitizers available: TSan, MSan, ASan, UBSan, LSan GYP_DEFINES='ubsan=1' gclient runhooks ninja -C out/Release ### Running Valgrind memcheck Memory errors and race conditions can be found by running tests under special memory tools. [Valgrind] [1] is an instrumentation framework for building dynamic analysis tools. Various tests and profilers are built upon it to find memory handling errors and memory leaks, for instance. 
[1]: http://valgrind.org solutions = [ { "name" : "libyuv", "url" : "https://chromium.googlesource.com/libyuv/libyuv", "deps_file" : "DEPS", "managed" : True, "custom_deps" : { "libyuv/chromium/src/third_party/valgrind": "https://chromium.googlesource.com/chromium/deps/valgrind/binaries", }, "safesync_url": "", }, ] Then run: GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=memcheck" python gyp_libyuv ninja -C out/Debug valgrind out/Debug/libyuv_unittest For more information, see http://www.chromium.org/developers/how-tos/using-valgrind ### Running Thread Sanitizer (TSan) GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=tsan" python gyp_libyuv ninja -C out/Debug valgrind out/Debug/libyuv_unittest For more info, see http://www.chromium.org/developers/how-tos/using-valgrind/threadsanitizer ### Running Address Sanitizer (ASan) GYP_DEFINES="clang=0 target_arch=x64 build_for_tool=asan" python gyp_libyuv ninja -C out/Debug valgrind out/Debug/libyuv_unittest For more info, see http://dev.chromium.org/developers/testing/addresssanitizer ## Benchmarking The unittests can be used to benchmark. ### Windows set LIBYUV_WIDTH=1280 set LIBYUV_HEIGHT=720 set LIBYUV_REPEAT=999 set LIBYUV_FLAGS=-1 out\Release\libyuv_unittest.exe --gtest_filter=*I420ToARGB_Opt ### Linux and Mac LIBYUV_WIDTH=1280 LIBYUV_HEIGHT=720 LIBYUV_REPEAT=1000 out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt libyuvTest.I420ToARGB_Opt (547 ms) Indicates 0.547 ms/frame for 1280 x 720. ## Making a change gclient sync git checkout -b mycl -t origin/master git pull git add -u git commit -m "my change" git cl lint git cl try git cl upload -r a-reviewer@chomium.org -s git cl land libyuv-0.0~git20220104.b91df1a/docs/environment_variables.md000066400000000000000000000026021416500237200233740ustar00rootroot00000000000000# Introduction For test purposes, environment variables can be set to control libyuv behavior. These should only be used for testing, to narrow down bugs or to test performance. 
# CPU By default the cpu is detected and the most advanced form of SIMD is used. But you can disable instruction sets selectively, or completely, falling back on C code. Set the variable to 1 to disable the specified instruction set. ## All CPUs LIBYUV_DISABLE_ASM ## Intel CPUs LIBYUV_DISABLE_X86 LIBYUV_DISABLE_SSE2 LIBYUV_DISABLE_SSSE3 LIBYUV_DISABLE_SSE41 LIBYUV_DISABLE_SSE42 LIBYUV_DISABLE_AVX LIBYUV_DISABLE_AVX2 LIBYUV_DISABLE_ERMS LIBYUV_DISABLE_FMA3 LIBYUV_DISABLE_F16C LIBYUV_DISABLE_AVX512BW LIBYUV_DISABLE_AVX512VL LIBYUV_DISABLE_AVX512VBMI LIBYUV_DISABLE_AVX512VBMI2 LIBYUV_DISABLE_AVX512VBITALG LIBYUV_DISABLE_AVX512VPOPCNTDQ LIBYUV_DISABLE_GFNI ## ARM CPUs LIBYUV_DISABLE_NEON ## MIPS CPUs LIBYUV_DISABLE_MSA LIBYUV_DISABLE_MMI # Test Width/Height/Repeat The unittests default to a small image (128x72) to run fast. This can be set by environment variable to test a specific resolutions. You can also repeat the test a specified number of iterations, allowing benchmarking and profiling. set LIBYUV_WIDTH=1280 set LIBYUV_HEIGHT=720 set LIBYUV_REPEAT=999 set LIBYUV_FLAGS=-1 set LIBYUV_CPU_INFO=-1 libyuv-0.0~git20220104.b91df1a/docs/filtering.md000066400000000000000000000241711416500237200207700ustar00rootroot00000000000000# Introduction This document discusses the current state of filtering in libyuv. An emphasis on maximum performance while avoiding memory exceptions, and minimal amount of code/complexity. See future work at end. # LibYuv Filter Subsampling There are 2 challenges with subsampling 1. centering of samples, which involves clamping on edges 2. clipping a source region Centering depends on scale factor and filter mode. # Down Sampling If scaling down, the stepping rate is always src_width / dst_width. dx = src_width / dst_width; e.g. If scaling from 1280x720 to 640x360, the step thru the source will be 2.0, stepping over 2 pixels of source for each pixel of destination. Centering, depends on filter mode. *Point* downsampling takes the middle pixel. 
x = dx >> 1; For odd scale factors (e.g. 3x down) this is exactly the middle. For even scale factors, this rounds up and takes the pixel to the right of center. e.g. scale of 4x down will take pixel 2. **Bilinear** filter, uses the 2x2 pixels in the middle. x = dx / 2 - 0.5; For odd scale factors (e.g. 3x down) this is exactly the middle, and point sampling is used. For even scale factors, this evenly filters the middle 2x2 pixels. e.g. 4x down will filter pixels 1,2 at 50% in both directions. **Box** filter averages the entire box so sampling starts at 0. x = 0; For a scale factor of 2x down, this is equivalent to bilinear. # Up Sampling **Point** upsampling uses a stepping rate of src_width / dst_width and a starting coordinate of 0. x = 0; dx = src_width / dst_width; e.g. If scaling from 640x360 to 1280x720 the step thru the source will be 0.5, stepping half a pixel of source for each pixel of destination. Each pixel is replicated by the scale factor. **Bilinear** filter stretches such that the first pixel of source maps to the first pixel of destination, and the last pixel of source maps to the last pixel of destination. x = 0; dx = (src_width - 1) / (dst_width - 1); This method is not technically correct, and will likely change in the future. * It is inconsistent with the bilinear down sampler. The same method could be used for down sampling, and then it would be more reversible, but that would prevent specialized 2x down sampling. * Although centered, the image is slightly magnified. * The filtering was changed in early 2013 - previously it used: x = 0; dx = src_width / dst_width; Which is the correct scale factor, but shifted the image left, and extruded the last pixel. The reason for the change was to remove the extruding code from the low level row functions, allowing 3 functions to share the same row functions - ARGBScale, I420Scale, and ARGBInterpolate.
Then the one function was ported to many cpu variations: SSE2, SSSE3, AVX2, Neon and 'Any' version for any number of pixels and alignment. The function is also specialized for 0,25,50,75%. The above still has the potential to read the last pixel 100% and last pixel + 1 0%, which may cause a memory exception. So the left pixel goes to a fraction less than the last pixel, but filters in the minimum amount of it, and the maximum of the last pixel. dx = FixedDiv((src_width << 16) - 0x00010001, (dst << 16) - 0x00010000); **Box** filter for upsampling switches over to Bilinear. # Scale snippet: #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) #define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \ (dst << 16) - 0x00010000); // Compute slope values for stepping. void ScaleSlope(int src_width, int src_height, int dst_width, int dst_height, FilterMode filtering, int* x, int* y, int* dx, int* dy) { assert(x != NULL); assert(y != NULL); assert(dx != NULL); assert(dy != NULL); assert(src_width != 0); assert(src_height != 0); assert(dst_width > 0); assert(dst_height > 0); if (filtering == kFilterBox) { // Scale step for point sampling duplicates all pixels equally. *dx = FixedDiv(Abs(src_width), dst_width); *dy = FixedDiv(src_height, dst_height); *x = 0; *y = 0; } else if (filtering == kFilterBilinear) { // Scale step for bilinear sampling renders last pixel once for upsample. if (dst_width <= Abs(src_width)) { *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); } else if (dst_width > 1) { *dx = FIXEDDIV1(Abs(src_width), dst_width); *x = 0; } if (dst_height <= src_height) { *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // -32768 = -0.5 to center bilinear. } else if (dst_height > 1) { *dy = FIXEDDIV1(src_height, dst_height); *y = 0; } } else if (filtering == kFilterLinear) { // Scale step for bilinear sampling renders last pixel once for upsample.
if (dst_width <= Abs(src_width)) { *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); } else if (dst_width > 1) { *dx = FIXEDDIV1(Abs(src_width), dst_width); *x = 0; } *dy = FixedDiv(src_height, dst_height); *y = *dy >> 1; } else { // Scale step for point sampling duplicates all pixels equally. *dx = FixedDiv(Abs(src_width), dst_width); *dy = FixedDiv(src_height, dst_height); *x = CENTERSTART(*dx, 0); *y = CENTERSTART(*dy, 0); } // Negative src_width means horizontally mirror. if (src_width < 0) { *x += (dst_width - 1) * *dx; *dx = -*dx; src_width = -src_width; } } # Future Work Point sampling should ideally be the same as bilinear, but pixel by pixel, round to nearest neighbor. But as is, it is reversible and exactly matches ffmpeg at all scale factors, both up and down. The scale factor is dx = src_width / dst_width; The step value is centered for down sample: x = dx / 2; Or starts at 0 for upsample. x = 0; Bilinear filtering is currently correct for down sampling, but not for upsampling. Upsampling is stretching the first and last pixel of source, to the first and last pixel of destination. dx = (src_width - 1) / (dst_width - 1);
x = 0; It should be stretching such that the first pixel is centered in the middle of the scale factor, to match the pixel that would be sampled for down sampling by the same amount. And same on last pixel. dx = src_width / dst_width;
x = dx / 2 - 0.5; This would start at -0.5 and go to last pixel + 0.5, sampling 50% from last pixel + 1. Then clamping would be needed. On GPUs there are numerous ways to clamp. 1. Clamp the coordinate to the edge of the texture, duplicating the first and last pixel. 2. Blend with a constant color, such as transparent black. Typically best for fonts. 3. Mirror the UV coordinate, which is similar to clamping. Good for continuous tone images. 4. Wrap the coordinate, for texture tiling. 5. Allow the coordinate to index beyond the image, which may be the correct data if sampling a subimage. 6. Extrapolate the edge based on the previous pixel. pixel -0.5 is computed from slope of pixel 0 and 1. Some of these are computational, even for a GPU, which is one reason textures are sometimes limited to power of 2 sizes. We do care about the clipping case, where allowing coordinates to become negative and index pixels before the image is the correct data. But normally for simple scaling, we want to clamp to the edge pixel. For example, if bilinear scaling from 3x3 to 30x30, we’d essentially want 10 pixels of each of the original 3 pixels. But we want the original pixels to land in the middle of each 10 pixels, at offsets 5, 15 and 25. There would be filtering between 5 and 15 between the original pixels 0 and 1. And filtering between 15 and 25 from original pixels 1 and 2. The first 5 pixels are clamped to pixel 0 and the last 5 pixels are clamped to pixel 2. The easiest way to implement this is copy the original 3 pixels to a buffer, and duplicate the first and last pixels. 0,1,2 becomes 0, 0,1,2, 2. Then implement a filtering without clamping. We call this source extruding. Its only necessary on up sampling, since down sampler will always have valid surrounding pixels. Extruding is practical when the image is already copied to a temporary buffer. 
It could be done to the original image, as long as the original memory is restored, but valgrind and/or memory protection would disallow this, so it requires a memcpy to a temporary buffer, which may hurt performance. The memcpy has a performance advantage, from a cache point of view, that can actually make this technique faster, depending on hardware characteristics. Vertical extrusion can be done with a memcpy of the first/last row, or clamping a pointer. The other way to implement clamping is handle the edges with a memset. e.g. Read first source pixel and memset the first 5 pixels. Filter pixels 0,1,2 to 5 to 25. Read last pixel and memset the last 5 pixels. Blur is implemented with this method like this, which has 3 loops per row - left, middle and right. Box filter is only used for 2x down sample or more. Its based on integer sized boxes. Technically it should be filtered edges, but thats substantially slower (roughly 100x), and at that point you may as well do a cubic filter which is more correct. Box filter currently sums rows into a row buffer. It does this with Mirroring will use the same slope as normal, but with a negative. The starting coordinate needs to consider the scale factor and filter. e.g. box filter of 30x30 to 3x3 with mirroring would use -10 for step, but x = 20. width (30) - dx. Step needs to be accurate, so it uses an integer divide. This is as much as 5% of the profile. An approximated divide is substantially faster, but the inaccuracy causes stepping beyond the original image boundaries. 3 general solutions: 1. copy image to buffer with padding. allows for small errors in stepping. 2. hash the divide, so common values are quickly found. 3. change api so caller provides the slope. libyuv-0.0~git20220104.b91df1a/docs/formats.md000066400000000000000000000225371416500237200204640ustar00rootroot00000000000000# Introduction Formats (FOURCC) supported by libyuv are detailed here. 
# Core Formats There are 2 core formats supported by libyuv - I420 and ARGB. All YUV formats can be converted to/from I420. All RGB formats can be converted to/from ARGB. Filtering functions such as scaling and planar functions work on I420 and/or ARGB. # OSX Core Media Pixel Formats This is how OSX formats map to libyuv enum { kCMPixelFormat_32ARGB = 32, FOURCC_BGRA kCMPixelFormat_32BGRA = 'BGRA', FOURCC_ARGB kCMPixelFormat_24RGB = 24, FOURCC_RAW kCMPixelFormat_16BE555 = 16, Not supported. kCMPixelFormat_16BE565 = 'B565', Not supported. kCMPixelFormat_16LE555 = 'L555', FOURCC_RGBO kCMPixelFormat_16LE565 = 'L565', FOURCC_RGBP kCMPixelFormat_16LE5551 = '5551', FOURCC_RGBO kCMPixelFormat_422YpCbCr8 = '2vuy', FOURCC_UYVY kCMPixelFormat_422YpCbCr8_yuvs = 'yuvs', FOURCC_YUY2 kCMPixelFormat_444YpCbCr8 = 'v308', FOURCC_I444 ? kCMPixelFormat_4444YpCbCrA8 = 'v408', Not supported. kCMPixelFormat_422YpCbCr16 = 'v216', Not supported. kCMPixelFormat_422YpCbCr10 = 'v210', FOURCC_V210 previously. Removed now. kCMPixelFormat_444YpCbCr10 = 'v410', Not supported. kCMPixelFormat_8IndexedGray_WhiteIsZero = 0x00000028, Not supported. }; # FOURCC (Four Character Code) List The following is extracted from video_common.h as a complete list of formats supported by libyuv. enum FourCC { // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020, unofficial fourcc. // 10 bit lsb // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), // deprecated.
// 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc, 2 64 bpp FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. // 1 Primary Compressed YUV format. FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), // 11 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. FOURCC_J420 = FOURCC('J', '4', '2', '0'), FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc FOURCC_H422 = FOURCC('H', '4', '2', '2'), // unofficial fourcc FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. 
FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. # Planar YUV The following formats contain a full size Y plane followed by 1 or 2 planes for UV: I420, I422, I444, I400, NV21, NV12, I400 The size (subsampling) of the UV varies. I420, NV12 and NV21 are half width, half height I422, NV16 and NV61 are half width, full height I444, NV24 and NV42 are full width, full height I400 and J400 have no chroma channel. # Color space The YUV formats start with a letter to specify the color space. e.g. I420 I = BT.601 limited range J = BT.601 full range (J = JPEG that uses this) H = BT.709 limited range (H for HD) F = BT.709 full range (F for Full range) U = BT.2020 limited range (U for UHD) V = BT.2020 full range For YUV to RGB conversions, a matrix can be passed. See also convert_argb.h # HDR formats Planar formats with 10 or 12 bits use the following fourcc: I010, I012, P010, P012 are half width, half height I210, I212, P210, P212 are half width, full height I410, I412, P410, P412 are full width, full height where I is the color space (see above) and 3 planes: Y, U and V. P is a biplanar format, similar to NV12 but 16 bits, with the valid bits in the high bits. There is a Y plane and a UV plane.
0, 2 or 4 is the last digit of subsampling: 4:2:0, 4:2:2, or 4:4:4 10 or 12 is the bits per channel. The bits are in the low bits of a 16 bit channel. # The ARGB FOURCC There are 4 ARGB layouts - ARGB, BGRA, ABGR and RGBA. ARGB is most common by far, used for screen formats, and windows webcam drivers. The fourcc describes the order of channels in a ***register***. A fourcc provided by a capturer can be thought of as a string, e.g. "ARGB". On little endian machines, as an int, this would have 'A' in the lowest byte. The FOURCC macro reverses the order: #define FOURCC(a, b, c, d) (((uint32)(a)) | ((uint32)(b) << 8) | ((uint32)(c) << 16) | ((uint32)(d) << 24)) So the "ARGB" string, read as an uint32, is FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B') If you were to read ARGB pixels as uint32's, the alpha would be in the high byte, and the blue in the lowest byte. In memory, these are stored little endian, so 'B' is first, then 'G', 'R' and 'A' last. When calling conversion functions, the names match the FOURCC, so in this case it would be I420ToARGB(). All formats can be converted to/from ARGB. Most 'planar_functions' work on ARGB (e.g. ARGBBlend). Some are channel order agnostic (e.g. ARGBScale). Some functions are symmetric (e.g. ARGBToBGRA is the same as BGRAToARGB, so it's a macro). ARGBBlend expects preattenuated ARGB. The R,G,B are premultiplied by alpha. Other functions don't care. # RGB24 and RAW There are 2 RGB layouts - RGB24 (aka 24BG) and RAW RGB24 is B,G,R in memory RAW is R,G,B in memory # AR30 and XR30 AR30 is 2 10 10 10 ARGB stored in little endian order. The 2 bit alpha has 4 values. Here are the comparable 8 bit alpha values. 0 - 0. 00000000b = 0x00 = 0 1 - 33%. 01010101b = 0x55 = 85 2 - 66%. 10101010b = 0xaa = 170 3 - 100%. 11111111b = 0xff = 255 The 10 bit RGB values range from 0 to 1023. XR30 is the same as AR30 but with no alpha channel. # AB64 and AR64 AB64 is similar to ABGR, with 16 bit (2 bytes) per channel. Each channel stores an unsigned short.
In memory R is the lowest and A is the highest. Each channel has value ranges from 0 to 65535. AR64 is similar to ARGB. # NV12 and NV21 NV12 is a biplanar format with a full sized Y plane followed by a single chroma plane with weaved U and V values. NV21 is the same but with weaved V and U values. The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half height chroma channel, and therefore is a 420 subsampling. NV16 is 16 bits per pixel, with half width and full height. aka 422. NV24 is 24 bits per pixel with full sized chroma channel. aka 444. Most NV12 functions allow the destination Y pointer to be NULL. # YUY2 and UYVY YUY2 is a packed YUV format with half width, full height. YUY2 is YUYV in memory UYVY is UYVY in memory libyuv-0.0~git20220104.b91df1a/docs/getting_started.md000066400000000000000000000220551416500237200221730ustar00rootroot00000000000000# Getting Started How to get and build the libyuv code. ## Pre-requisites You'll need to have depot tools installed: https://www.chromium.org/developers/how-tos/install-depot-tools Refer to chromium instructions for each platform for other prerequisites. 
## Getting the Code Create a working directory, enter it, and run: gclient config --name src https://chromium.googlesource.com/libyuv/libyuv gclient sync Then you'll get a .gclient file like: solutions = [ { "name" : "src", "url" : "https://chromium.googlesource.com/libyuv/libyuv", "deps_file" : "DEPS", "managed" : True, "custom_deps" : { }, "safesync_url": "", }, ]; For iOS add `;target_os=['ios'];` to your OSX .gclient and run `gclient sync`. Browse the Git repository: https://chromium.googlesource.com/libyuv/libyuv/+/master ### Android For Android add `;target_os=['android'];` to your Linux .gclient solutions = [ { "name" : "src", "url" : "https://chromium.googlesource.com/libyuv/libyuv", "deps_file" : "DEPS", "managed" : True, "custom_deps" : { }, "safesync_url": "", }, ]; target_os = ["android", "linux"]; Then run: gclient sync To get just the source (not buildable): git clone https://chromium.googlesource.com/libyuv/libyuv ## Building the Library and Unittests ### Windows call gn gen out\Release "--args=is_debug=false target_cpu=\"x64\"" call gn gen out\Debug "--args=is_debug=true target_cpu=\"x64\"" ninja -v -C out\Release ninja -v -C out\Debug call gn gen out\Release "--args=is_debug=false target_cpu=\"x86\"" call gn gen out\Debug "--args=is_debug=true target_cpu=\"x86\"" ninja -v -C out\Release ninja -v -C out\Debug ### macOS and Linux gn gen out/Release "--args=is_debug=false" gn gen out/Debug "--args=is_debug=true" ninja -v -C out/Release ninja -v -C out/Debug ### Building Official with GN gn gen out/Official "--args=is_debug=false is_official_build=true is_chrome_branded=true" ninja -C out/Official ### iOS http://www.chromium.org/developers/how-tos/build-instructions-ios Add to .gclient last line: `target_os=['ios'];` arm64 gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\"" gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false target_cpu=\"arm64\"" ninja -v -C
out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest ios simulator gn gen out/Release "--args=is_debug=false target_os=\"ios\" ios_enable_code_signing=false use_xcode_clang=true target_cpu=\"x86\"" gn gen out/Debug "--args=is_debug=true target_os=\"ios\" ios_enable_code_signing=false use_xcode_clang=true target_cpu=\"x86\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest ios disassembly otool -tV ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt ### Android https://code.google.com/p/chromium/wiki/AndroidBuildInstructions Add to .gclient last line: `target_os=['android'];` arm64 gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm64\"" gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm64\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest armv7 gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"arm\"" gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"arm\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest ia32 gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"x86\"" gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"x86\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest mips gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true" gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest arm disassembly: third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv/row_common.o >row_common.txt 
third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon.o >row_neon.txt third_party/android_ndk/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-objdump -d ./out/Release/obj/libyuv_neon/row_neon64.o >row_neon64.txt Caveat: Disassembly may require optimize_max be disabled in BUILD.gn Running tests: out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* Running test as benchmark: out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=-1 --libyuv_cpu_info=-1 Running test with C code: out/Release/bin/run_libyuv_unittest -vv --gtest_filter=* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=999 --libyuv_flags=1 --libyuv_cpu_info=1 ### Build targets ninja -C out/Debug libyuv ninja -C out/Debug libyuv_unittest ninja -C out/Debug compare ninja -C out/Debug yuvconvert ninja -C out/Debug yuvconstants ninja -C out/Debug psnr ninja -C out/Debug cpuid ### ARM Linux gn gen out/Release "--args=is_debug=false target_cpu=\"arm64\"" gn gen out/Debug "--args=is_debug=true target_cpu=\"arm64\"" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest ### MIPS Linux mips gn gen out/Release "--args=is_debug=false target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false" gn gen out/Debug "--args=is_debug=true target_os=\"linux\" target_cpu=\"mips64el\" mips_arch_variant=\"loongson3\" mips_use_mmi=true is_component_build=false use_sysroot=false use_gold=false" ninja -v -C out/Debug libyuv_unittest ninja -v -C out/Release libyuv_unittest ## Building the Library with make ### Linux make V=1 -f linux.mk make V=1 -f linux.mk clean make V=1 -f linux.mk CXX=clang++ CC=clang ## Building the library with cmake Install cmake: http://www.cmake.org/ ### Default 
debug build: mkdir out cd out cmake .. cmake --build . ### Release build/install mkdir out cd out cmake -DCMAKE_INSTALL_PREFIX="/usr/lib" -DCMAKE_BUILD_TYPE="Release" .. cmake --build . --config Release sudo cmake --build . --target install --config Release ### Build RPM/DEB packages mkdir out cd out cmake -DCMAKE_BUILD_TYPE=Release .. make -j4 make package ## Setup for Arm Cross compile See also https://www.ccoderun.ca/programming/2015-12-20_CrossCompiling/index.html sudo apt-get install ssh dkms build-essential linux-headers-generic sudo apt-get install kdevelop cmake git subversion sudo apt-get install graphviz doxygen doxygen-gui sudo apt-get install manpages manpages-dev manpages-posix manpages-posix-dev sudo apt-get install libboost-all-dev libboost-dev libssl-dev sudo apt-get install rpm terminator fish sudo apt-get install g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf ### Build psnr tool cd util arm-linux-gnueabihf-g++ psnr_main.cc psnr.cc ssim.cc -o psnr arm-linux-gnueabihf-objdump -d psnr ## Running Unittests ### Windows out\Release\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter="*" ### macOS and Linux out/Release/libyuv_unittest --gtest_filter="*" Replace --gtest_filter="*" with the specific unittest to run. May include wildcards. out/Release/libyuv_unittest --gtest_filter=*I420ToARGB_Opt ## CPU Emulator tools ### Intel SDE (Software Development Emulator) Pre-requisite: Install IntelSDE: http://software.intel.com/en-us/articles/intel-software-development-emulator Then run: c:\intelsde\sde -hsw -- out\Release\libyuv_unittest.exe --gtest_filter=* ~/intelsde/sde -skx -- out/Release/libyuv_unittest --gtest_filter=**I420ToARGB_Opt ### Intel Architecture Code Analyzer Insert these 2 macros into assembly code to be analyzed: IACA_ASM_START IACA_ASM_END Build the code as usual, then run iaca on the object file.
~/iaca-lin64/bin/iaca.sh -reduceout -arch HSW out/Release/obj/libyuv_internal/compare_gcc.o ## Sanitizers gn gen out/Release "--args=is_debug=false is_msan=true" ninja -v -C out/Release Sanitizers available: asan, msan, tsan, ubsan, lsan, ubsan_vptr ### Running Dr Memory memcheck for Windows Pre-requisite: Install Dr Memory for Windows and add it to your path: http://www.drmemory.org/docs/page_install_windows.html drmemory out\Debug\libyuv_unittest.exe --gtest_catch_exceptions=0 --gtest_filter=* libyuv-0.0~git20220104.b91df1a/docs/rotation.md000066400000000000000000000114551416500237200206450ustar00rootroot00000000000000# Introduction Rotation by multiples of 90 degrees allows mobile devices to rotate webcams from landscape to portrait. The higher level functions ConvertToI420 and ConvertToARGB allow rotation of any format. Optimized functionality is supported for I420, ARGB, NV12 and NV21. # ConvertToI420 int ConvertToI420(const uint8* src_frame, size_t src_size, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int crop_x, int crop_y, int src_width, int src_height, int crop_width, int crop_height, enum RotationMode rotation, uint32 format); This function crops, converts, and rotates. You should think of it in that order. * Crops the original image, which is src_width x src_height, to crop_width x crop_height. At this point the image is still not rotated. * Converts the cropped region to I420. Supports inverted source for src_height negative. * Rotates by 90, 180 or 270 degrees. The buffer the caller provides should account for rotation. It is especially important to get the stride of the destination correct. e.g. 640 x 480 NV12 captured
Crop to 640 x 360
Rotate by 90 degrees to 360 x 640.
Caller passes stride of 360 for Y and 360 / 2 for U and V.
Caller passes crop_width of 640, crop_height of 360.
# ConvertToARGB int ConvertToARGB(const uint8* src_frame, size_t src_size, uint8* dst_argb, int dst_stride_argb, int crop_x, int crop_y, int src_width, int src_height, int crop_width, int crop_height, enum RotationMode rotation, uint32 format); Same as I420, but implementation is less optimized - reads columns and writes rows, 16 bytes at a time. # I420Rotate int I420Rotate(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int src_width, int src_height, enum RotationMode mode); Destination is rotated, so pass dst_stride_y etc that consider rotation.
Rotate by 180 can be done in place, but 90 and 270 can not. Implementation (Neon/SSE2) uses 8 x 8 block transpose, so best efficiency is with sizes and pointers that are aligned to 8. Cropping can be achieved by adjusting the src_y/u/v pointers and src_width, src_height. Lower level plane functions are provided, allowing other planar formats to be rotated. (e.g. I444) For other planar YUV formats (I444, I422, I411, I400, NV16, NV24), the planar functions are exposed and can be called directly // Rotate a plane by 0, 90, 180, or 270. int RotatePlane(const uint8* src, int src_stride, uint8* dst, int dst_stride, int src_width, int src_height, enum RotationMode mode); # ARGBRotate LIBYUV_API int ARGBRotate(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int src_width, int src_height, enum RotationMode mode); Same as I420, but implementation is less optimized - reads columns and writes rows. Rotate by 90, or any angle, can be achieved using ARGBAffine. # Mirror - Horizontal Flip Mirror functions for horizontally flipping an image, which can be useful for 'self view' of a webcam. int I420Mirror(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, uint8* dst_y, int dst_stride_y, uint8* dst_u, int dst_stride_u, uint8* dst_v, int dst_stride_v, int width, int height); int ARGBMirror(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height); Mirror functionality can also be achieved with the I420Scale and ARGBScale functions by passing negative width and/or height. # Invert - Vertical Flip Inverting can be achieved with almost any libyuv function by passing a negative source height. I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height. 
# Cropping - Vertical Flip When cropping from a subsampled format like NV21, the method of setting the start pointers wont work for odd crop start y on the UV plane. If the height after cropping will be odd, invert the source - point to the last row, negate the strides, and pass negative height, which will re-invert the image as the conversion outputs. libyuv-0.0~git20220104.b91df1a/download_vs_toolchain.py000066400000000000000000000017771416500237200224730ustar00rootroot00000000000000#!/usr/bin/env python # # Copyright 2014 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. # This script is used to run the vs_toolchain.py script to download the # Visual Studio toolchain. It's just a temporary measure while waiting for the # Chrome team to move find_depot_tools into src/build to get rid of these # workarounds (similar one in gyp_libyuv). import os import sys checkout_root = os.path.dirname(os.path.realpath(__file__)) sys.path.insert(0, os.path.join(checkout_root, 'build')) sys.path.insert(0, os.path.join(checkout_root, 'tools', 'find_depot_tools')) import vs_toolchain # pylint: disable=wrong-import-position if __name__ == '__main__': sys.exit(vs_toolchain.main()) libyuv-0.0~git20220104.b91df1a/include/000077500000000000000000000000001416500237200171515ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/include/libyuv.h000066400000000000000000000020221416500237200206300ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. 
An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_H_ #define INCLUDE_LIBYUV_H_ #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/convert.h" #include "libyuv/convert_argb.h" #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #include "libyuv/mjpeg_decoder.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/rotate_argb.h" #include "libyuv/row.h" #include "libyuv/scale.h" #include "libyuv/scale_argb.h" #include "libyuv/scale_row.h" #include "libyuv/scale_uv.h" #include "libyuv/version.h" #include "libyuv/video_common.h" #endif // INCLUDE_LIBYUV_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/000077500000000000000000000000001416500237200204635ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/include/libyuv/basic_types.h000066400000000000000000000041561416500237200231470ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ #define INCLUDE_LIBYUV_BASIC_TYPES_H_ #include // For size_t and NULL #if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) #define INT_TYPES_DEFINED #if defined(_MSC_VER) && (_MSC_VER < 1600) #include // for uintptr_t on x86 typedef unsigned __int64 uint64_t; typedef __int64 int64_t; typedef unsigned int uint32_t; typedef int int32_t; typedef unsigned short uint16_t; typedef short int16_t; typedef unsigned char uint8_t; typedef signed char int8_t; #else #include // for uintptr_t and C99 types #endif // defined(_MSC_VER) && (_MSC_VER < 1600) // Types are deprecated. Enable this macro for legacy types. #ifdef LIBYUV_LEGACY_TYPES typedef uint64_t uint64; typedef int64_t int64; typedef uint32_t uint32; typedef int32_t int32; typedef uint16_t uint16; typedef int16_t int16; typedef uint8_t uint8; typedef int8_t int8; #endif // LIBYUV_LEGACY_TYPES #endif // INT_TYPES_DEFINED #if !defined(LIBYUV_API) #if defined(_WIN32) || defined(__CYGWIN__) #if defined(LIBYUV_BUILDING_SHARED_LIBRARY) #define LIBYUV_API __declspec(dllexport) #elif defined(LIBYUV_USING_SHARED_LIBRARY) #define LIBYUV_API __declspec(dllimport) #else #define LIBYUV_API #endif // LIBYUV_BUILDING_SHARED_LIBRARY #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ defined(LIBYUV_USING_SHARED_LIBRARY)) #define LIBYUV_API __attribute__((visibility("default"))) #else #define LIBYUV_API #endif // __GNUC__ #endif // LIBYUV_API // TODO(fbarchard): Remove bool macros. #define LIBYUV_BOOL int #define LIBYUV_FALSE 0 #define LIBYUV_TRUE 1 #endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/compare.h000066400000000000000000000065171416500237200222730ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. 
An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_COMPARE_H_ #define INCLUDE_LIBYUV_COMPARE_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Compute a hash for specified memory. Seed of 5381 recommended. LIBYUV_API uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); // Hamming Distance LIBYUV_API uint64_t ComputeHammingDistance(const uint8_t* src_a, const uint8_t* src_b, int count); // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API uint32_t ARGBDetect(const uint8_t* argb, int stride_argb, int width, int height); // Sum Square Error - used to compute Mean Square Error or PSNR. LIBYUV_API uint64_t ComputeSumSquareError(const uint8_t* src_a, const uint8_t* src_b, int count); LIBYUV_API uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, int stride_a, const uint8_t* src_b, int stride_b, int width, int height); static const int kMaxPsnr = 128; LIBYUV_API double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); LIBYUV_API double CalcFramePsnr(const uint8_t* src_a, int stride_a, const uint8_t* src_b, int stride_b, int width, int height); LIBYUV_API double I420Psnr(const uint8_t* src_y_a, int stride_y_a, const uint8_t* src_u_a, int stride_u_a, const uint8_t* src_v_a, int stride_v_a, const uint8_t* src_y_b, int stride_y_b, const uint8_t* src_u_b, int stride_u_b, const uint8_t* src_v_b, int stride_v_b, int width, int height); LIBYUV_API double CalcFrameSsim(const uint8_t* src_a, int stride_a, const uint8_t* src_b, int stride_b, int width, int height); LIBYUV_API double I420Ssim(const uint8_t* src_y_a, int stride_y_a, const uint8_t* src_u_a, int stride_u_a, const uint8_t* src_v_a, int stride_v_a, const uint8_t* src_y_b, int stride_y_b, const uint8_t* src_u_b, int 
stride_u_b, const uint8_t* src_v_b, int stride_v_b, int width, int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_COMPARE_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/compare_row.h000066400000000000000000000120311416500237200231460ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_COMPARE_ROW_H_ #define INCLUDE_LIBYUV_COMPARE_ROW_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif #if defined(__native_client__) #define LIBYUV_DISABLE_NEON #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif // Visual C 2012 required for AVX2. #if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 // clang >= 3.4.0 required for AVX2. 
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) #if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) #define CLANG_HAS_AVX2 1 #endif // clang >= 3.4 #endif // __clang__ // The following are available for Visual C and GCC: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__) || defined(_M_IX86)) #define HAS_HASHDJB2_SSE41 #define HAS_SUMSQUAREERROR_SSE2 #define HAS_HAMMINGDISTANCE_SSE42 #endif // The following are available for Visual C and clangcl 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ !defined(__clang__) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_HASHDJB2_AVX2 #define HAS_SUMSQUAREERROR_AVX2 #endif // The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_HAMMINGDISTANCE_SSSE3 #endif // The following are available for GCC and clangcl: #if !defined(LIBYUV_DISABLE_X86) && defined(CLANG_HAS_AVX2) && \ (defined(__x86_64__) || defined(__i386__)) #define HAS_HAMMINGDISTANCE_AVX2 #endif // The following are available for Neon: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SUMSQUAREERROR_NEON #define HAS_HAMMINGDISTANCE_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_HAMMINGDISTANCE_MSA #define HAS_SUMSQUAREERROR_MSA #endif #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #define HAS_HAMMINGDISTANCE_MMI #define HAS_SUMSQUAREERROR_MMI #endif uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HammingDistance_SSSE3(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HammingDistance_NEON(const 
uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HammingDistance_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HammingDistance_MMI(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t SumSquareError_MSA(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t SumSquareError_MMI(const uint8_t* src_a, const uint8_t* src_b, int count); uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed); uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_COMPARE_ROW_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/convert.h000066400000000000000000000617161416500237200223270ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" #include "libyuv/rotate.h" // For enum RotationMode. // TODO(fbarchard): fix WebRTC source to include following libyuv headers: #include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 #include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. 
b/620 #include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Convert I444 to I420. LIBYUV_API int I444ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert I444 to NV12. LIBYUV_API int I444ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert I444 to NV21. LIBYUV_API int I444ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Convert I422 to I420. LIBYUV_API int I422ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert I422 to I444. LIBYUV_API int I422ToI444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert I422 to NV21. LIBYUV_API int I422ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Copy I420 to I420. 
#define I420ToI420 I420Copy LIBYUV_API int I420Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert I420 to I444. LIBYUV_API int I420ToI444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Copy I010 to I010 #define I010ToI010 I010Copy #define H010ToH010 I010Copy LIBYUV_API int I010Copy(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); // Convert 10 bit YUV to 8 bit #define H010ToH420 I010ToI420 LIBYUV_API int I010ToI420(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); #define H210ToH422 I210ToI422 LIBYUV_API int I210ToI422(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); #define H410ToH444 I410ToI444 LIBYUV_API int I410ToI444(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); #define H012ToH420 I012ToI420 LIBYUV_API int I012ToI420(const uint16_t* src_y, int src_stride_y, const 
uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); #define H212ToH422 I212ToI422 LIBYUV_API int I212ToI422(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); #define H412ToH444 I412ToI444 LIBYUV_API int I412ToI444(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); #define I412ToI012 I410ToI010 #define H410ToH010 I410ToI010 #define H412ToH012 I410ToI010 LIBYUV_API int I410ToI010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); #define I212ToI012 I210ToI010 #define H210ToH010 I210ToI010 #define H212ToH012 I210ToI010 LIBYUV_API int I210ToI010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); // Convert I010 to I410 LIBYUV_API int I010ToI410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); // Convert I012 to I412 #define I012ToI412 I010ToI410 // Convert I210 to I410 LIBYUV_API int I210ToI410(const uint16_t* src_y, int src_stride_y, 
const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); // Convert I212 to I412 #define I212ToI412 I210ToI410 // Convert I010 to P010 LIBYUV_API int I010ToP010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height); // Convert I210 to P210 LIBYUV_API int I210ToP210(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height); // Convert I012 to P012 LIBYUV_API int I012ToP012(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height); // Convert I212 to P212 LIBYUV_API int I212ToP212(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height); // Convert I400 (grey) to I420. LIBYUV_API int I400ToI420(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert I400 (grey) to NV21. LIBYUV_API int I400ToNV21(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); #define J400ToJ420 I400ToI420 // Convert NV12 to I420. 
LIBYUV_API int NV12ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert NV21 to I420. LIBYUV_API int NV21ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert NV12 to NV24. LIBYUV_API int NV12ToNV24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert NV16 to NV24. LIBYUV_API int NV16ToNV24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert P010 to P410. LIBYUV_API int P010ToP410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height); // Convert P012 to P412. #define P012ToP412 P010ToP410 // Convert P016 to P416. #define P016ToP416 P010ToP410 // Convert P210 to P410. LIBYUV_API int P210ToP410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height); // Convert P212 to P412. #define P212ToP412 P210ToP410 // Convert P216 to P416. #define P216ToP416 P210ToP410 // Convert YUY2 to I420. LIBYUV_API int YUY2ToI420(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert UYVY to I420. 
LIBYUV_API int UYVYToI420(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert AYUV to NV12. LIBYUV_API int AYUVToNV12(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert AYUV to NV21. LIBYUV_API int AYUVToNV21(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Convert Android420 to I420. LIBYUV_API int Android420ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // ARGB little endian (bgra in memory) to I420. LIBYUV_API int ARGBToI420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // BGRA little endian (argb in memory) to I420. LIBYUV_API int BGRAToI420(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // ABGR little endian (rgba in memory) to I420. LIBYUV_API int ABGRToI420(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGBA little endian (abgr in memory) to I420. LIBYUV_API int RGBAToI420(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB little endian (bgr in memory) to I420. 
LIBYUV_API int RGB24ToI420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB little endian (bgr in memory) to J420. LIBYUV_API int RGB24ToJ420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB big endian (rgb in memory) to I420. LIBYUV_API int RAWToI420(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB big endian (rgb in memory) to J420. LIBYUV_API int RAWToJ420(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB16 (RGBP fourcc) little endian to I420. LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB15 (RGBO fourcc) little endian to I420. LIBYUV_API int ARGB1555ToI420(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB12 (R444 fourcc) little endian to I420. LIBYUV_API int ARGB4444ToI420(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // RGB little endian (bgr in memory) to J400. LIBYUV_API int RGB24ToJ400(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_yj, int dst_stride_yj, int width, int height); // RGB big endian (rgb in memory) to J400. 
LIBYUV_API int RAWToJ400(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_yj, int dst_stride_yj, int width, int height); // src_width/height provided by capture. // dst_width/height for clipping determine final size. LIBYUV_API int MJPGToI420(const uint8_t* sample, size_t sample_size, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int src_width, int src_height, int dst_width, int dst_height); // JPEG to NV21 LIBYUV_API int MJPGToNV21(const uint8_t* sample, size_t sample_size, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int src_width, int src_height, int dst_width, int dst_height); // JPEG to NV12 LIBYUV_API int MJPGToNV12(const uint8_t* sample, size_t sample_size, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int src_width, int src_height, int dst_width, int dst_height); // Query size of MJPG in pixels. LIBYUV_API int MJPGSize(const uint8_t* sample, size_t sample_size, int* width, int* height); // Convert camera sample to I420 with cropping, rotation and vertical flip. // "src_size" is needed to parse MJPG. // "dst_stride_y" number of bytes in a row of the dst_y plane. // Normally this would be the same as dst_width, with recommended alignment // to 16 bytes for better efficiency. // If rotation of 90 or 270 is used, stride is affected. The caller should // allocate the I420 buffer according to rotation. // "dst_stride_u" number of bytes in a row of the dst_u plane. // Normally this would be the same as (dst_width + 1) / 2, with // recommended alignment to 16 bytes for better efficiency. // If rotation of 90 or 270 is used, stride is affected. // "crop_x" and "crop_y" are starting position for cropping. // To center, crop_x = (src_width - dst_width) / 2 // crop_y = (src_height - dst_height) / 2 // "src_width" / "src_height" is size of src_frame in pixels. // "src_height" can be negative indicating a vertically flipped image source. 
// "crop_width" / "crop_height" is the size to crop the src to. // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. // "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API int ConvertToI420(const uint8_t* sample, size_t sample_size, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int crop_x, int crop_y, int src_width, int src_height, int crop_width, int crop_height, enum RotationMode rotation, uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_CONVERT_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/convert_argb.h000066400000000000000000001752311416500237200233200ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #include "libyuv/basic_types.h" #include "libyuv/rotate.h" // For enum RotationMode. 
#ifdef __cplusplus namespace libyuv { extern "C" { #endif // Conversion matrix for YUV to RGB LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601 LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // BT.601 full LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709 LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020 LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full // Conversion matrix for YVU to BGR LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601 LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // BT.601 full LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709 LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full // Macros for end swapped destination Matrix conversions. // Swap UV and pass mirrored kYvuJPEGConstants matrix. // TODO(fbarchard): Add macro for each Matrix function. 
#define kYuvI601ConstantsVU kYvuI601Constants #define kYuvJPEGConstantsVU kYvuJPEGConstants #define kYuvH709ConstantsVU kYvuH709Constants #define kYuvF709ConstantsVU kYvuF709Constants #define kYuv2020ConstantsVU kYvu2020Constants #define kYuvV2020ConstantsVU kYvuV2020Constants #define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) #define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) #define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) #define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) #define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, 
k, l, m, n) \ I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) // Alias. #define ARGBToARGB ARGBCopy // Copy ARGB to ARGB. LIBYUV_API int ARGBCopy(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I420 to ARGB. LIBYUV_API int I420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I420 to ABGR. LIBYUV_API int I420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert J420 to ARGB. LIBYUV_API int J420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert J420 to ABGR. LIBYUV_API int J420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert H420 to ARGB. LIBYUV_API int H420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert H420 to ABGR. LIBYUV_API int H420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert U420 to ARGB. 
LIBYUV_API int U420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert U420 to ABGR. LIBYUV_API int U420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I422 to ARGB. LIBYUV_API int I422ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I422 to ABGR. LIBYUV_API int I422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert J422 to ARGB. LIBYUV_API int J422ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert J422 to ABGR. LIBYUV_API int J422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert H422 to ARGB. LIBYUV_API int H422ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert H422 to ABGR. LIBYUV_API int H422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert U422 to ARGB. 
LIBYUV_API int U422ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert U422 to ABGR. LIBYUV_API int U422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I444 to ARGB. LIBYUV_API int I444ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I444 to ABGR. LIBYUV_API int I444ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert J444 to ARGB. LIBYUV_API int J444ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert J444 to ABGR. LIBYUV_API int J444ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert H444 to ARGB. LIBYUV_API int H444ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert H444 to ABGR. LIBYUV_API int H444ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert U444 to ARGB. 
LIBYUV_API int U444ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert U444 to ABGR. LIBYUV_API int U444ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I010 to ARGB. LIBYUV_API int I010ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I010 to ABGR. LIBYUV_API int I010ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert H010 to ARGB. LIBYUV_API int H010ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert H010 to ABGR. LIBYUV_API int H010ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert U010 to ARGB. LIBYUV_API int U010ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert U010 to ABGR. LIBYUV_API int U010ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I210 to ARGB. 
LIBYUV_API int I210ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I210 to ABGR. LIBYUV_API int I210ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert H210 to ARGB. LIBYUV_API int H210ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert H210 to ABGR. LIBYUV_API int H210ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert U210 to ARGB. LIBYUV_API int U210ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert U210 to ABGR. LIBYUV_API int U210ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I420 with Alpha to preattenuated ARGB. LIBYUV_API int I420AlphaToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int attenuate); // Convert I420 with Alpha to preattenuated ABGR. 
LIBYUV_API int I420AlphaToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height, int attenuate); // Convert I422 with Alpha to preattenuated ARGB. LIBYUV_API int I422AlphaToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int attenuate); // Convert I422 with Alpha to preattenuated ABGR. LIBYUV_API int I422AlphaToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height, int attenuate); // Convert I444 with Alpha to preattenuated ARGB. LIBYUV_API int I444AlphaToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int attenuate); // Convert I444 with Alpha to preattenuated ABGR. LIBYUV_API int I444AlphaToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height, int attenuate); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API int I400ToARGB(const uint8_t* src_y, int src_stride_y, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert J400 (jpeg grey) to ARGB. LIBYUV_API int J400ToARGB(const uint8_t* src_y, int src_stride_y, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Alias. #define YToARGB I400ToARGB // Convert NV12 to ARGB. 
LIBYUV_API int NV12ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert NV21 to ARGB. LIBYUV_API int NV21ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert NV12 to ABGR. LIBYUV_API int NV12ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert NV21 to ABGR. LIBYUV_API int NV21ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert NV12 to RGB24. LIBYUV_API int NV12ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); // Convert NV21 to RGB24. LIBYUV_API int NV21ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); // Convert NV21 to YUV24. LIBYUV_API int NV21ToYUV24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_yuv24, int dst_stride_yuv24, int width, int height); // Convert NV12 to RAW. LIBYUV_API int NV12ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_raw, int dst_stride_raw, int width, int height); // Convert NV21 to RAW. LIBYUV_API int NV21ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_raw, int dst_stride_raw, int width, int height); // Convert YUY2 to ARGB. LIBYUV_API int YUY2ToARGB(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert UYVY to ARGB. 
LIBYUV_API int UYVYToARGB(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I010 to AR30. LIBYUV_API int I010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert H010 to AR30. LIBYUV_API int H010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert I010 to AB30. LIBYUV_API int I010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert H010 to AB30. LIBYUV_API int H010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert U010 to AR30. LIBYUV_API int U010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert U010 to AB30. LIBYUV_API int U010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert I210 to AR30. LIBYUV_API int I210ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert I210 to AB30. 
LIBYUV_API int I210ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert H210 to AR30. LIBYUV_API int H210ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert H210 to AB30. LIBYUV_API int H210ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert U210 to AR30. LIBYUV_API int U210ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert U210 to AB30. LIBYUV_API int U210ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // BGRA little endian (argb in memory) to ARGB. LIBYUV_API int BGRAToARGB(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // ABGR little endian (rgba in memory) to ARGB. LIBYUV_API int ABGRToARGB(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // RGBA little endian (abgr in memory) to ARGB. LIBYUV_API int RGBAToARGB(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Deprecated function name. #define BG24ToARGB RGB24ToARGB // RGB little endian (bgr in memory) to ARGB. 
LIBYUV_API int RGB24ToARGB(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // RGB big endian (rgb in memory) to ARGB. LIBYUV_API int RAWToARGB(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // RGB big endian (rgb in memory) to RGBA. LIBYUV_API int RAWToRGBA(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height); // RGB16 (RGBP fourcc) little endian to ARGB. LIBYUV_API int RGB565ToARGB(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // RGB15 (RGBO fourcc) little endian to ARGB. LIBYUV_API int ARGB1555ToARGB(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // RGB12 (R444 fourcc) little endian to ARGB. LIBYUV_API int ARGB4444ToARGB(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Aliases #define AB30ToARGB AR30ToABGR #define AB30ToABGR AR30ToARGB #define AB30ToAR30 AR30ToAB30 // Convert AR30 To ARGB. LIBYUV_API int AR30ToARGB(const uint8_t* src_ar30, int src_stride_ar30, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert AR30 To ABGR. LIBYUV_API int AR30ToABGR(const uint8_t* src_ar30, int src_stride_ar30, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert AR30 To AB30. LIBYUV_API int AR30ToAB30(const uint8_t* src_ar30, int src_stride_ar30, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert AR64 to ARGB. LIBYUV_API int AR64ToARGB(const uint16_t* src_ar64, int src_stride_ar64, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert AB64 to ABGR. #define AB64ToABGR AR64ToARGB // Convert AB64 to ARGB. 
LIBYUV_API int AB64ToARGB(const uint16_t* src_ab64, int src_stride_ab64, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert AR64 to ABGR. #define AR64ToABGR AB64ToARGB // Convert AR64 To AB64. LIBYUV_API int AR64ToAB64(const uint16_t* src_ar64, int src_stride_ar64, uint16_t* dst_ab64, int dst_stride_ab64, int width, int height); // Convert AB64 To AR64. #define AB64ToAR64 AR64ToAB64 // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API int MJPGToARGB(const uint8_t* sample, size_t sample_size, uint8_t* dst_argb, int dst_stride_argb, int src_width, int src_height, int dst_width, int dst_height); // Convert Android420 to ARGB. LIBYUV_API int Android420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert Android420 to ABGR. LIBYUV_API int Android420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert NV12 to RGB565. LIBYUV_API int NV12ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height); // Convert I422 to BGRA. LIBYUV_API int I422ToBGRA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_bgra, int dst_stride_bgra, int width, int height); // Convert I422 to ABGR. LIBYUV_API int I422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I422 to RGBA. 
LIBYUV_API int I422ToRGBA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height); LIBYUV_API int I420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); LIBYUV_API int I420ToBGRA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_bgra, int dst_stride_bgra, int width, int height); LIBYUV_API int I420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); LIBYUV_API int I420ToRGBA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height); LIBYUV_API int I420ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); LIBYUV_API int I420ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_raw, int dst_stride_raw, int width, int height); LIBYUV_API int H420ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); LIBYUV_API int H420ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_raw, int dst_stride_raw, int width, int height); LIBYUV_API int J420ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, 
const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); LIBYUV_API int J420ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_raw, int dst_stride_raw, int width, int height); LIBYUV_API int I420ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height); LIBYUV_API int J420ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height); LIBYUV_API int H420ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height); LIBYUV_API int I422ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height); // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. 
LIBYUV_API int I420ToRGB565Dither(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, const uint8_t* dither4x4, int width, int height); LIBYUV_API int I420ToARGB1555(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb1555, int dst_stride_argb1555, int width, int height); LIBYUV_API int I420ToARGB4444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb4444, int dst_stride_argb4444, int width, int height); // Convert I420 to AR30. LIBYUV_API int I420ToAR30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert I420 to AB30. LIBYUV_API int I420ToAB30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert H420 to AR30. LIBYUV_API int H420ToAR30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert H420 to AB30. LIBYUV_API int H420ToAB30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height); // Convert I420 to ARGB with matrix. LIBYUV_API int I420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert I422 to ARGB with matrix. 
LIBYUV_API int I422ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert I444 to ARGB with matrix. LIBYUV_API int I444ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert 10 bit 420 YUV to AR30 with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // Convert 10 bit 422 YUV to AR30 with matrix. LIBYUV_API int I210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // Convert 10 bit 444 YUV to AR30 with matrix. LIBYUV_API int I410ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // multiply 12 bit yuv into high bits to allow any number of bits.
LIBYUV_API int I012ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // Convert 12 bit YUV to ARGB with matrix. LIBYUV_API int I012ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert 10 bit 422 YUV to ARGB with matrix. LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert 10 bit 444 YUV to ARGB with matrix. LIBYUV_API int I410ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert P010 to ARGB with matrix. LIBYUV_API int P010ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert P210 to ARGB with matrix. LIBYUV_API int P210ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert P010 to AR30 with matrix. LIBYUV_API int P010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // Convert P210 to AR30 with matrix. 
LIBYUV_API int P210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // P012 and P010 use most significant bits so the conversion is the same. // Convert P012 to ARGB with matrix. #define P012ToARGBMatrix P010ToARGBMatrix // Convert P012 to AR30 with matrix. #define P012ToAR30Matrix P010ToAR30Matrix // Convert P212 to ARGB with matrix. #define P212ToARGBMatrix P210ToARGBMatrix // Convert P212 to AR30 with matrix. #define P212ToAR30Matrix P210ToAR30Matrix // Convert P016 to ARGB with matrix. #define P016ToARGBMatrix P010ToARGBMatrix // Convert P016 to AR30 with matrix. #define P016ToAR30Matrix P010ToAR30Matrix // Convert P216 to ARGB with matrix. #define P216ToARGBMatrix P210ToARGBMatrix // Convert P216 to AR30 with matrix. #define P216ToAR30Matrix P210ToAR30Matrix // Convert I420 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate); // Convert I422 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I422AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate); // Convert I444 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I444AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate); // Convert I010 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I010AlphaToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate); // Convert I210 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I210AlphaToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate); // Convert I410 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I410AlphaToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate); // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert NV21 to ARGB with matrix. 
LIBYUV_API int NV21ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert NV12 to RGB565 with matrix. LIBYUV_API int NV12ToRGB565Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb565, int dst_stride_rgb565, const struct YuvConstants* yuvconstants, int width, int height); // Convert NV12 to RGB24 with matrix. LIBYUV_API int NV12ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height); // Convert NV21 to RGB24 with matrix. LIBYUV_API int NV21ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height); // Convert Android420 to ARGB with matrix. LIBYUV_API int Android420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert I422 to RGBA with matrix. LIBYUV_API int I422ToRGBAMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, const struct YuvConstants* yuvconstants, int width, int height); // Convert I420 to RGBA with matrix. LIBYUV_API int I420ToRGBAMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, const struct YuvConstants* yuvconstants, int width, int height); // Convert I420 to RGB24 with matrix.
LIBYUV_API int I420ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height); // Convert I420 to RGB565 with specified color matrix. LIBYUV_API int I420ToRGB565Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, const struct YuvConstants* yuvconstants, int width, int height); // Convert I420 to AR30 with matrix. LIBYUV_API int I420ToAR30Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API int I400ToARGBMatrix(const uint8_t* src_y, int src_stride_y, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height); // Convert camera sample to ARGB with cropping, rotation and vertical flip. // "sample_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. // Normally this would be the same as dst_width * 4 (4 bytes per ARGB // pixel), with recommended alignment to 16 bytes for better efficiency. // If rotation of 90 or 270 is used, stride is affected. The caller should // allocate the ARGB buffer according to rotation. // "dst_stride_u" number of bytes in a row of the dst_u plane. // Normally this would be the same as (dst_width + 1) / 2, with // recommended alignment to 16 bytes for better efficiency. // If rotation of 90 or 270 is used, stride is affected. // NOTE(review): ConvertToARGB has no dst_u plane or "dst_stride_u" // parameter; the "dst_stride_u" description above does not apply here. // "crop_x" and "crop_y" are starting position for cropping. // To center, crop_x = (src_width - dst_width) / 2 // crop_y = (src_height - dst_height) / 2 // "src_width" / "src_height" is size of src_frame in pixels.
// "src_height" can be negative indicating a vertically flipped image source. // "crop_width" / "crop_height" is the size to crop the src to. // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. // "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. LIBYUV_API int ConvertToARGB(const uint8_t* sample, size_t sample_size, uint8_t* dst_argb, int dst_stride_argb, int crop_x, int crop_y, int src_width, int src_height, int crop_width, int crop_height, enum RotationMode rotation, uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/convert_from.h000066400000000000000000000135671416500237200233530ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_H_ #include "libyuv/basic_types.h" #include "libyuv/rotate.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // See Also convert.h for conversions from formats to I420. // Convert 8 bit YUV to 10 bit. #define H420ToH010 I420ToI010 LIBYUV_API int I420ToI010(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); // Convert 8 bit YUV to 12 bit. 
#define H420ToH012 I420ToI012 LIBYUV_API int I420ToI012(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height); LIBYUV_API int I420ToI422(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); LIBYUV_API int I420ToI444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. LIBYUV_API int I400Copy(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height); LIBYUV_API int I420ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); LIBYUV_API int I420ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); LIBYUV_API int I420ToYUY2(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height); LIBYUV_API int I420ToUYVY(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height); // The following are from convert_argb.h // DEPRECATED: The 
prototypes will be removed in future. Use convert_argb.h // Convert I420 to ARGB. LIBYUV_API int I420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I420 to ABGR. LIBYUV_API int I420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. LIBYUV_API int ConvertFromI420(const uint8_t* y, int y_stride, const uint8_t* u, int u_stride, const uint8_t* v, int v_stride, uint8_t* dst_sample, int dst_sample_stride, int width, int height, uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/convert_from_argb.h000066400000000000000000000220271416500237200243350ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Copy ARGB to ARGB. #define ARGBToARGB ARGBCopy LIBYUV_API int ARGBCopy(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert ARGB To BGRA. 
LIBYUV_API int ARGBToBGRA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_bgra, int dst_stride_bgra, int width, int height); // Convert ARGB To ABGR. LIBYUV_API int ARGBToABGR(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height); // Convert ARGB To RGBA. LIBYUV_API int ARGBToRGBA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height); // Aliases #define ARGBToAB30 ABGRToAR30 #define ABGRToAB30 ARGBToAR30 // Convert ABGR To AR30. LIBYUV_API int ABGRToAR30(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Convert ARGB To AR30. LIBYUV_API int ARGBToAR30(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height); // Aliases #define ABGRToRGB24 ARGBToRAW #define ABGRToRAW ARGBToRGB24 // Convert ARGB To RGB24. LIBYUV_API int ARGBToRGB24(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); // Convert ARGB To RAW. LIBYUV_API int ARGBToRAW(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_raw, int dst_stride_raw, int width, int height); // Convert ARGB To RGB565. LIBYUV_API int ARGBToRGB565(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height); // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. // TODO(fbarchard): Consider pointer to 2d array for dither4x4. // const uint8_t(*dither)[4][4]; LIBYUV_API int ARGBToRGB565Dither(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_rgb565, int dst_stride_rgb565, const uint8_t* dither4x4, int width, int height); // Convert ARGB To ARGB1555. 
LIBYUV_API int ARGBToARGB1555(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb1555, int dst_stride_argb1555, int width, int height); // Convert ARGB To ARGB4444. LIBYUV_API int ARGBToARGB4444(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb4444, int dst_stride_argb4444, int width, int height); // Convert ARGB To I444. LIBYUV_API int ARGBToI444(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert ARGB to AR64. LIBYUV_API int ARGBToAR64(const uint8_t* src_argb, int src_stride_argb, uint16_t* dst_ar64, int dst_stride_ar64, int width, int height); // Convert ABGR to AB64. #define ABGRToAB64 ARGBToAR64 // Convert ARGB to AB64. LIBYUV_API int ARGBToAB64(const uint8_t* src_argb, int src_stride_argb, uint16_t* dst_ab64, int dst_stride_ab64, int width, int height); // Convert ABGR to AR64. #define ABGRToAR64 ARGBToAB64 // Convert ARGB To I422. LIBYUV_API int ARGBToI422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert ARGB To I420. (also in convert.h) LIBYUV_API int ARGBToI420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API int ARGBToJ420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert ARGB to J422. LIBYUV_API int ARGBToJ422(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert ARGB to J400. (JPeg full range). 
LIBYUV_API int ARGBToJ400(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yj, int dst_stride_yj, int width, int height); // Convert RGBA to J400. (JPeg full range). LIBYUV_API int RGBAToJ400(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_yj, int dst_stride_yj, int width, int height); // Convert ARGB to I400. LIBYUV_API int ARGBToI400(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, int width, int height); // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB) LIBYUV_API int ARGBToG(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_g, int dst_stride_g, int width, int height); // Convert ARGB To NV12. LIBYUV_API int ARGBToNV12(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert ARGB To NV21. LIBYUV_API int ARGBToNV21(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Convert ABGR To NV12. LIBYUV_API int ABGRToNV12(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert ABGR To NV21. LIBYUV_API int ABGRToNV21(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Convert ARGB To YUY2. LIBYUV_API int ARGBToYUY2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height); // Convert ARGB To UYVY. 
LIBYUV_API int ARGBToUYVY(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/cpu_id.h000066400000000000000000000103051416500237200220760ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_CPU_ID_H_ #define INCLUDE_LIBYUV_CPU_ID_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Internal flag to indicate cpuid requires initialization. static const int kCpuInitialized = 0x1; // These flags are only valid on ARM processors. static const int kCpuHasARM = 0x2; static const int kCpuHasNEON = 0x4; // 0x8 reserved for future ARM flag. // These flags are only valid on x86 processors. static const int kCpuHasX86 = 0x10; static const int kCpuHasSSE2 = 0x20; static const int kCpuHasSSSE3 = 0x40; static const int kCpuHasSSE41 = 0x80; static const int kCpuHasSSE42 = 0x100; // unused at this time. 
static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; static const int kCpuHasF16C = 0x2000; static const int kCpuHasGFNI = 0x4000; static const int kCpuHasAVX512BW = 0x8000; static const int kCpuHasAVX512VL = 0x10000; static const int kCpuHasAVX512VBMI = 0x20000; static const int kCpuHasAVX512VBMI2 = 0x40000; static const int kCpuHasAVX512VBITALG = 0x80000; static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; // These flags are only valid on MIPS processors. static const int kCpuHasMIPS = 0x200000; static const int kCpuHasMSA = 0x400000; static const int kCpuHasMMI = 0x800000; // Optional init function. TestCpuFlag does an auto-init. // Returns cpu_info flags. LIBYUV_API int InitCpuFlags(void); // Detect CPU has SSE2 etc. // Test_flag parameter should be one of kCpuHas constants above. // Returns non-zero if instruction set is detected static __inline int TestCpuFlag(int test_flag) { LIBYUV_API extern int cpu_info_; #ifdef __ATOMIC_RELAXED int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); #else int cpu_info = cpu_info_; #endif return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; } // Internal function for parsing /proc/cpuinfo. LIBYUV_API int ArmCpuCaps(const char* cpuinfo_name); LIBYUV_API int MipsCpuCaps(const char* cpuinfo_name); // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. // MaskCpuFlags(-1) to enable all cpu specific optimizations. // MaskCpuFlags(1) to disable all cpu specific optimizations. // MaskCpuFlags(0) to reset state so next call will auto init. // Returns cpu_info flags. LIBYUV_API int MaskCpuFlags(int enable_flags); // Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| // should be a valid combination of the kCpuHas constants above and include // kCpuInitialized. 
Use this method when running in a sandboxed process where // the detection code might fail (as it might access /proc/cpuinfo). In such // cases the cpu_info can be obtained from a non sandboxed process by calling // InitCpuFlags() and passed to the sandboxed process (via command line // parameters, IPC...) which can then call this method to initialize the CPU // flags. // Notes: // - when specifying 0 for |cpu_flags|, the auto initialization is enabled // again. // - enabling CPU features that are not supported by the CPU will result in // undefined behavior. // TODO(fbarchard): consider writing a helper function that translates from // other library CPU info to libyuv CPU info and add a .md doc that explains // CPU detection. static __inline void SetCpuFlags(int cpu_flags) { LIBYUV_API extern int cpu_info_; #ifdef __ATOMIC_RELAXED __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); #else cpu_info_ = cpu_flags; #endif } // Low level cpuid for X86. Returns zeros on other CPUs. // eax is the info type that you want. // ecx is typically the cpu number, and should normally be zero. LIBYUV_API void CpuId(int info_eax, int info_ecx, int* cpu_info); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_CPU_ID_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/macros_msa.h000066400000000000000000000250531416500237200227650ustar00rootroot00000000000000/* * Copyright 2016 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #ifndef INCLUDE_LIBYUV_MACROS_MSA_H_ #define INCLUDE_LIBYUV_MACROS_MSA_H_ #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #include #include #if (__mips_isa_rev >= 6) #define LW(psrc) \ ({ \ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ uint32_t val_m; \ asm volatile("lw %[val_m], %[psrc_lw_m] \n" \ : [val_m] "=r"(val_m) \ : [psrc_lw_m] "m"(*psrc_lw_m)); \ val_m; \ }) #if (__mips == 64) #define LD(psrc) \ ({ \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ uint64_t val_m = 0; \ asm volatile("ld %[val_m], %[psrc_ld_m] \n" \ : [val_m] "=r"(val_m) \ : [psrc_ld_m] "m"(*psrc_ld_m)); \ val_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ ({ \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ val0_m = LW(psrc_ld_m); \ val1_m = LW(psrc_ld_m + 4); \ val_m = (uint64_t)(val1_m); /* NOLINT */ \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ val_m; \ }) #endif // (__mips == 64) #define SW(val, pdst) \ ({ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val_m = (val); \ asm volatile("sw %[val_m], %[pdst_sw_m] \n" \ : [pdst_sw_m] "=m"(*pdst_sw_m) \ : [val_m] "r"(val_m)); \ }) #if (__mips == 64) #define SD(val, pdst) \ ({ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint64_t val_m = (val); \ asm volatile("sd %[val_m], %[pdst_sd_m] \n" \ : [pdst_sd_m] "=m"(*pdst_sd_m) \ : [val_m] "r"(val_m)); \ }) #else // !(__mips == 64) #define SD(val, pdst) \ ({ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val0_m, val1_m; \ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ SW(val0_m, pdst_sd_m); \ SW(val1_m, pdst_sd_m + 4); \ }) #endif // !(__mips == 64) #else // !(__mips_isa_rev >= 6) #define LW(psrc) \ ({ \ const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \ uint32_t val_m; \ asm volatile("ulw %[val_m], %[psrc_lw_m] \n" \ : 
[val_m] "=r"(val_m) \ : [psrc_lw_m] "m"(*psrc_lw_m)); \ val_m; \ }) #if (__mips == 64) #define LD(psrc) \ ({ \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ uint64_t val_m = 0; \ asm volatile("uld %[val_m], %[psrc_ld_m] \n" \ : [val_m] "=r"(val_m) \ : [psrc_ld_m] "m"(*psrc_ld_m)); \ val_m; \ }) #else // !(__mips == 64) #define LD(psrc) \ ({ \ const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \ uint32_t val0_m, val1_m; \ uint64_t val_m = 0; \ val0_m = LW(psrc_ld_m); \ val1_m = LW(psrc_ld_m + 4); \ val_m = (uint64_t)(val1_m); /* NOLINT */ \ val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ val_m; \ }) #endif // (__mips == 64) #define SW(val, pdst) \ ({ \ uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val_m = (val); \ asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ : [pdst_sw_m] "=m"(*pdst_sw_m) \ : [val_m] "r"(val_m)); \ }) #define SD(val, pdst) \ ({ \ uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ uint32_t val0_m, val1_m; \ val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ SW(val0_m, pdst_sd_m); \ SW(val1_m, pdst_sd_m + 4); \ }) #endif // (__mips_isa_rev >= 6) // TODO(fbarchard): Consider removing __VAR_ARGS versions. #define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ #define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) #define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ #define LD_UH(...) LD_H(const v8u16, __VA_ARGS__) #define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ #define ST_UB(...) ST_B(v16u8, __VA_ARGS__) #define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ #define ST_UH(...) 
ST_H(v8u16, __VA_ARGS__) /* Description : Load two vectors with 16 'byte' sized elements Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ #define LD_B2(RTYPE, psrc, stride, out0, out1) \ { \ out0 = LD_B(RTYPE, (psrc)); \ out1 = LD_B(RTYPE, (psrc) + stride); \ } #define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) #define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ { \ LD_B2(RTYPE, (psrc), stride, out0, out1); \ LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ } #define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) /* Description : Store two vectors with stride each having 16 'byte' sized elements Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ #define ST_B2(RTYPE, in0, in1, pdst, stride) \ { \ ST_B(RTYPE, in0, (pdst)); \ ST_B(RTYPE, in1, (pdst) + stride); \ } #define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) #define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ ST_B2(RTYPE, in0, in1, (pdst), stride); \ ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } #define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) /* Description : Store vectors of 8 halfword elements with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 8 halfword elements from 'in0' to (pdst) Store 8 halfword elements from 'in1' to (pdst + stride) */ #define ST_H2(RTYPE, in0, in1, pdst, stride) \ { \ ST_H(RTYPE, in0, (pdst)); \ ST_H(RTYPE, in1, (pdst) + stride); \ } #define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) // TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. 
/* Description : Shuffle byte vector elements as per mask vector Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 Outputs - out0, out1 Return Type - as per RTYPE Details : Byte elements from 'in0' & 'in1' are copied selectively to 'out0' as per control vector 'mask0' */ #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ { \ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ } #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) /* Description : Interleave both left and right half of input vectors Arguments : Inputs - in0, in1 Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of byte elements from 'in0' and 'in1' are interleaved and written to 'out0' */ #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ { \ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ } #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) #endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ #endif // INCLUDE_LIBYUV_MACROS_MSA_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/mjpeg_decoder.h000066400000000000000000000135031416500237200234250ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ #define INCLUDE_LIBYUV_MJPEG_DECODER_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus // NOTE: For a simplified public API use convert.h MJPGToI420(). 
struct jpeg_common_struct; struct jpeg_decompress_struct; struct jpeg_source_mgr; namespace libyuv { #ifdef __cplusplus extern "C" { #endif LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); #ifdef __cplusplus } // extern "C" #endif static const uint32_t kUnknownDataSize = 0xFFFFFFFF; enum JpegSubsamplingType { kJpegYuv420, kJpegYuv422, kJpegYuv444, kJpegYuv400, kJpegUnknown }; struct Buffer { const uint8_t* data; int len; }; struct BufferVector { Buffer* buffers; int len; int pos; }; struct SetJmpErrorMgr; // MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are // simply independent JPEG images with a fixed huffman table (which is omitted). // It is rarely used in video transmission, but is common as a camera capture // format, especially in Logitech devices. This class implements a decoder for // MJPEG frames. // // See http://tools.ietf.org/html/rfc2435 class LIBYUV_API MJpegDecoder { public: typedef void (*CallbackFunction)(void* opaque, const uint8_t* const* data, const int* strides, int rows); static const int kColorSpaceUnknown; static const int kColorSpaceGrayscale; static const int kColorSpaceRgb; static const int kColorSpaceYCbCr; static const int kColorSpaceCMYK; static const int kColorSpaceYCCK; MJpegDecoder(); ~MJpegDecoder(); // Loads a new frame, reads its headers, and determines the uncompressed // image format. // Returns LIBYUV_TRUE if image looks valid and format is supported. // If return value is LIBYUV_TRUE, then the values for all the following // getters are populated. // src_len is the size of the compressed mjpeg frame in bytes. LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); // Returns width of the last loaded frame in pixels. int GetWidth(); // Returns height of the last loaded frame in pixels. int GetHeight(); // Returns format of the last loaded frame. The return value is one of the // kColorSpace* constants. int GetColorSpace(); // Number of color components in the color space. 
int GetNumComponents(); // Sample factors of the n-th component. int GetHorizSampFactor(int component); int GetVertSampFactor(int component); int GetHorizSubSampFactor(int component); int GetVertSubSampFactor(int component); // Public for testability. int GetImageScanlinesPerImcuRow(); // Public for testability. int GetComponentScanlinesPerImcuRow(int component); // Width of a component in bytes. int GetComponentWidth(int component); // Height of a component. int GetComponentHeight(int component); // Width of a component in bytes with padding for DCTSIZE. Public for testing. int GetComponentStride(int component); // Size of a component in bytes. int GetComponentSize(int component); // Call this after LoadFrame() if you decide you don't want to decode it // after all. LIBYUV_BOOL UnloadFrame(); // Decodes the entire image into a one-buffer-per-color-component format. // dst_width must match exactly. dst_height must be <= to image height; if // less, the image is cropped. "planes" must have size equal to at least // GetNumComponents() and they must point to non-overlapping buffers of size // at least GetComponentSize(i). The pointers in planes are incremented // to point to after the end of the written data. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); // Decodes the entire image and passes the data via repeated calls to a // callback function. Each call will get the data for a whole number of // image scanlines. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, int dst_width, int dst_height); // The helper function which recognizes the jpeg sub-sampling type. 
static JpegSubsamplingType JpegSubsamplingTypeHelper( int* subsample_x, int* subsample_y, int number_of_components); private: void AllocOutputBuffers(int num_outbufs); void DestroyOutputBuffers(); LIBYUV_BOOL StartDecode(); LIBYUV_BOOL FinishDecode(); void SetScanlinePointers(uint8_t** data); LIBYUV_BOOL DecodeImcuRow(); int GetComponentScanlinePadding(int component); // A buffer holding the input data for a frame. Buffer buf_; BufferVector buf_vec_; jpeg_decompress_struct* decompress_struct_; jpeg_source_mgr* source_mgr_; SetJmpErrorMgr* error_mgr_; // LIBYUV_TRUE iff at least one component has scanline padding. (i.e., // GetComponentScanlinePadding() != 0.) LIBYUV_BOOL has_scanline_padding_; // Temporaries used to point to scanline outputs. int num_outbufs_; // Outermost size of all arrays below. uint8_t*** scanlines_; int* scanlines_sizes_; // Temporary buffer used for decoding when we can't decode directly to the // output buffers. Large enough for just one iMCU row. uint8_t** databuf_; int* databuf_strides_; }; } // namespace libyuv #endif // __cplusplus #endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/planar_functions.h000066400000000000000000001017661416500237200242140ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #include "libyuv/basic_types.h" // TODO(fbarchard): Remove the following headers includes. 
#include "libyuv/convert.h" #include "libyuv/convert_argb.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // TODO(fbarchard): Move cpu macros to row.h #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) #define HAS_ARGBAFFINEROW_SSE2 #endif // Copy a plane of data. LIBYUV_API void CopyPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height); LIBYUV_API void CopyPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height); LIBYUV_API void Convert16To8Plane(const uint16_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int scale, // 16384 for 10 bits int width, int height); LIBYUV_API void Convert8To16Plane(const uint8_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int scale, // 1024 for 10 bits int width, int height); // Set a plane of data to a 32 bit value. LIBYUV_API void SetPlane(uint8_t* dst_y, int dst_stride_y, int width, int height, uint32_t value); // Split interleaved UV plane into separate U and V planes. LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Merge separate U and V planes into one interleaved UV plane. 
LIBYUV_API void MergeUVPlane(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Split interleaved msb UV plane into separate lsb U and V planes. LIBYUV_API void SplitUVPlane_16(const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height, int depth); // Merge separate lsb U and V planes into one interleaved msb UV plane. LIBYUV_API void MergeUVPlane_16(const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_uv, int dst_stride_uv, int width, int height, int depth); // Convert lsb plane to msb plane LIBYUV_API void ConvertToMSBPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height, int depth); // Convert msb plane to lsb plane LIBYUV_API void ConvertToLSBPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height, int depth); // Scale U and V to half width and height and merge into interleaved UV plane. // width and height are source size, allowing odd sizes. // Use for converting I444 or I422 to NV12. LIBYUV_API void HalfMergeUVPlane(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Swap U and V channels in interleaved UV plane. LIBYUV_API void SwapUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Split interleaved RGB plane into separate R, G and B planes. LIBYUV_API void SplitRGBPlane(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_r, int dst_stride_r, uint8_t* dst_g, int dst_stride_g, uint8_t* dst_b, int dst_stride_b, int width, int height); // Merge separate R, G and B planes into one interleaved RGB plane. 
LIBYUV_API void MergeRGBPlane(const uint8_t* src_r, int src_stride_r, const uint8_t* src_g, int src_stride_g, const uint8_t* src_b, int src_stride_b, uint8_t* dst_rgb, int dst_stride_rgb, int width, int height); // Split interleaved ARGB plane into separate R, G, B and A planes. // dst_a can be NULL to discard alpha plane. LIBYUV_API void SplitARGBPlane(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_r, int dst_stride_r, uint8_t* dst_g, int dst_stride_g, uint8_t* dst_b, int dst_stride_b, uint8_t* dst_a, int dst_stride_a, int width, int height); // Merge separate R, G, B and A planes into one interleaved ARGB plane. // src_a can be NULL to fill opaque value to alpha. LIBYUV_API void MergeARGBPlane(const uint8_t* src_r, int src_stride_r, const uint8_t* src_g, int src_stride_g, const uint8_t* src_b, int src_stride_b, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Merge separate 'depth' bit R, G and B planes stored in lsb // into one interleaved XR30 plane. // depth should in range [10, 16] LIBYUV_API void MergeXR30Plane(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height, int depth); // Merge separate 'depth' bit R, G, B and A planes stored in lsb // into one interleaved AR64 plane. // src_a can be NULL to fill opaque value to alpha. // depth should in range [1, 16] LIBYUV_API void MergeAR64Plane(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, const uint16_t* src_a, int src_stride_a, uint16_t* dst_ar64, int dst_stride_ar64, int width, int height, int depth); // Merge separate 'depth' bit R, G, B and A planes stored in lsb // into one interleaved ARGB plane. // src_a can be NULL to fill opaque value to alpha. 
// depth should in range [8, 16] LIBYUV_API void MergeARGB16To8Plane(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int depth); // Copy I400. Supports inverting. LIBYUV_API int I400ToI400(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height); #define J400ToJ400 I400ToI400 // Copy I422 to I422. #define I422ToI422 I422Copy LIBYUV_API int I422Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Copy I444 to I444. #define I444ToI444 I444Copy LIBYUV_API int I444Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Copy NV12. Supports inverting. int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Copy NV21. Supports inverting. int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height); // Convert YUY2 to I422. LIBYUV_API int YUY2ToI422(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Convert UYVY to I422. 
LIBYUV_API int UYVYToI422(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); LIBYUV_API int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); LIBYUV_API int UYVYToNV12(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Convert NV21 to NV12. LIBYUV_API int NV21ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, int width, int height); // Convert I420 to I400. (calls CopyPlane ignoring u/v). LIBYUV_API int I420ToI400(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, int width, int height); // Alias #define J420ToJ400 I420ToI400 #define I420ToI420Mirror I420Mirror // I420 mirror. LIBYUV_API int I420Mirror(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Alias #define I400ToI400Mirror I400Mirror // I400 mirror. A single plane is mirrored horizontally. // Pass negative height to achieve 180 degree rotation. LIBYUV_API int I400Mirror(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height); // Alias #define NV12ToNV12Mirror NV12Mirror // NV12 mirror. 
LIBYUV_API int NV12Mirror(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Alias #define ARGBToARGBMirror ARGBMirror // ARGB mirror. LIBYUV_API int ARGBMirror(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Alias #define RGB24ToRGB24Mirror RGB24Mirror // RGB24 mirror. LIBYUV_API int RGB24Mirror(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); // Mirror a plane of data. LIBYUV_API void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height); // Mirror a plane of UV data. LIBYUV_API void MirrorUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_uv, int dst_stride_uv, int width, int height); // Alias #define RGB24ToRAW RAWToRGB24 LIBYUV_API int RAWToRGB24(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height); // Draw a rectangle into I420. LIBYUV_API int I420Rect(uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int x, int y, int width, int height, int value_y, int value_u, int value_v); // Draw a rectangle into ARGB. LIBYUV_API int ARGBRect(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height, uint32_t value); // Convert ARGB to gray scale ARGB. LIBYUV_API int ARGBGrayTo(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Make a rectangle of ARGB gray scale. LIBYUV_API int ARGBGray(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height); // Make a rectangle of ARGB Sepia tone. 
LIBYUV_API int ARGBSepia(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height); // Apply a matrix rotation to each ARGB pixel. // matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. // The first 4 coefficients apply to B, G, R, A and produce B of the output. // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The next 4 coefficients apply to B, G, R, A and produce R of the output. // The last 4 coefficients apply to B, G, R, A and produce A of the output. LIBYUV_API int ARGBColorMatrix(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, const int8_t* matrix_argb, int width, int height); // Deprecated. Use ARGBColorMatrix instead. // Apply a matrix rotation to each ARGB pixel. // matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1. // The first 4 coefficients apply to B, G, R, A and produce B of the output. // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The last 4 coefficients apply to B, G, R, A and produce R of the output. LIBYUV_API int RGBColorMatrix(uint8_t* dst_argb, int dst_stride_argb, const int8_t* matrix_rgb, int dst_x, int dst_y, int width, int height); // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API int ARGBColorTable(uint8_t* dst_argb, int dst_stride_argb, const uint8_t* table_argb, int dst_x, int dst_y, int width, int height); // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API int RGBColorTable(uint8_t* dst_argb, int dst_stride_argb, const uint8_t* table_argb, int dst_x, int dst_y, int width, int height); // Apply a luma/color table each ARGB pixel but preserve destination alpha. // Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from // RGB (YJ style) and C is an 8 bit color component (R, G or B). 
LIBYUV_API int ARGBLumaColorTable(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, const uint8_t* luma, int width, int height); // Apply a 3 term polynomial to ARGB values. // poly points to a 4x4 matrix. The first row is constants. The 2nd row is // coefficients for b, g, r and a. The 3rd row is coefficients for b squared, // g squared, r squared and a squared. The 4rd row is coefficients for b to // the 3, g to the 3, r to the 3 and a to the 3. The values are summed and // result clamped to 0 to 255. // A polynomial approximation can be dirived using software such as 'R'. LIBYUV_API int ARGBPolynomial(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, const float* poly, int width, int height); // Convert plane of 16 bit shorts to half floats. // Source values are multiplied by scale before storing as half float. LIBYUV_API int HalfFloatPlane(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, float scale, int width, int height); // Convert a buffer of bytes to floats, scale the values and store as floats. LIBYUV_API int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width); // Quantize a rectangle of ARGB. Alpha unaffected. // scale is a 16 bit fractional fixed point scaler between 0 and 65535. // interval_size should be a value between 1 and 255. // interval_offset should be a value between 0 and 255. LIBYUV_API int ARGBQuantize(uint8_t* dst_argb, int dst_stride_argb, int scale, int interval_size, int interval_offset, int dst_x, int dst_y, int width, int height); // Copy ARGB to ARGB. LIBYUV_API int ARGBCopy(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Copy Alpha channel of ARGB to alpha of ARGB. LIBYUV_API int ARGBCopyAlpha(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Extract the alpha channel from ARGB. 
LIBYUV_API int ARGBExtractAlpha(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_a, int dst_stride_a, int width, int height); // Copy Y channel to Alpha of ARGB. LIBYUV_API int ARGBCopyYToAlpha(const uint8_t* src_y, int src_stride_y, uint8_t* dst_argb, int dst_stride_argb, int width, int height); typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); // Get function to Alpha Blend ARGB pixels and store to destination. LIBYUV_API ARGBBlendRow GetARGBBlend(); // Alpha Blend ARGB images and store to destination. // Source is pre-multiplied by alpha using ARGBAttenuate. // Alpha of destination is set to 255. LIBYUV_API int ARGBBlend(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Alpha Blend plane and store to destination. // Source is not pre-multiplied by alpha. LIBYUV_API int BlendPlane(const uint8_t* src_y0, int src_stride_y0, const uint8_t* src_y1, int src_stride_y1, const uint8_t* alpha, int alpha_stride, uint8_t* dst_y, int dst_stride_y, int width, int height); // Alpha Blend YUV images and store to destination. // Source is not pre-multiplied by alpha. // Alpha is full width x height and subsampled to half size to apply to UV. LIBYUV_API int I420Blend(const uint8_t* src_y0, int src_stride_y0, const uint8_t* src_u0, int src_stride_u0, const uint8_t* src_v0, int src_stride_v0, const uint8_t* src_y1, int src_stride_y1, const uint8_t* src_u1, int src_stride_u1, const uint8_t* src_v1, int src_stride_v1, const uint8_t* alpha, int alpha_stride, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height); // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. 
LIBYUV_API int ARGBMultiply(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Add ARGB image with ARGB image. Saturates to 255. LIBYUV_API int ARGBAdd(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. LIBYUV_API int ARGBSubtract(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert I422 to YUY2. LIBYUV_API int I422ToYUY2(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height); // Convert I422 to UYVY. LIBYUV_API int I422ToUYVY(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height); // Convert unattentuated ARGB to preattenuated ARGB. LIBYUV_API int ARGBAttenuate(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API int ARGBUnattenuate(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Internal function - do not call directly. // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API int ARGBComputeCumulativeSum(const uint8_t* src_argb, int src_stride_argb, int32_t* dst_cumsum, int dst_stride32_cumsum, int width, int height); // Blur ARGB image. // dst_cumsum table of width * (height + 1) * 16 bytes aligned to // 16 byte boundary. 
// dst_stride32_cumsum is number of ints in a row (width * 4). // radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. // Blur is optimized for radius of 5 (11x11) or less. LIBYUV_API int ARGBBlur(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int32_t* dst_cumsum, int dst_stride32_cumsum, int width, int height, int radius); // Gaussian 5x5 blur a float plane. // Coefficients of 1, 4, 6, 4, 1. // Each destination pixel is a blur of the 5x5 // pixels from the source. // Source edges are clamped. LIBYUV_API int GaussPlane_F32(const float* src, int src_stride, float* dst, int dst_stride, int width, int height); // Multiply ARGB image by ARGB value. LIBYUV_API int ARGBShade(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height, uint32_t value); // Interpolate between two images using specified amount of interpolation // (0 to 255) and store to destination. // 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 // and 255 means 1% src0 and 99% src1. LIBYUV_API int InterpolatePlane(const uint8_t* src0, int src_stride0, const uint8_t* src1, int src_stride1, uint8_t* dst, int dst_stride, int width, int height, int interpolation); // Interpolate between two ARGB images using specified amount of interpolation // Internally calls InterpolatePlane with width * 4 (bpp). LIBYUV_API int ARGBInterpolate(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int interpolation); // Interpolate between two YUV images using specified amount of interpolation // Internally calls InterpolatePlane on each plane where the U and V planes // are half width and half height. 
LIBYUV_API int I420Interpolate(const uint8_t* src0_y, int src0_stride_y, const uint8_t* src0_u, int src0_stride_u, const uint8_t* src0_v, int src0_stride_v, const uint8_t* src1_y, int src1_stride_y, const uint8_t* src1_u, int src1_stride_u, const uint8_t* src1_v, int src1_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, int interpolation); // Row function for copying pixels from a source with a slope to a row // of destination. Useful for scaling, rotation, mirror, texture mapping. LIBYUV_API void ARGBAffineRow_C(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* uv_dudv, int width); // TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* uv_dudv, int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. // shuffler is 16 bytes. LIBYUV_API int ARGBShuffle(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_argb, int dst_stride_argb, const uint8_t* shuffler, int width, int height); // Shuffle AR64 channel order. e.g. AR64 to AB64. // shuffler is 16 bytes. LIBYUV_API int AR64Shuffle(const uint16_t* src_ar64, int src_stride_ar64, uint16_t* dst_ar64, int dst_stride_ar64, const uint8_t* shuffler, int width, int height); // Sobel ARGB effect with planar output. LIBYUV_API int ARGBSobelToPlane(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, int width, int height); // Sobel ARGB effect. LIBYUV_API int ARGBSobel(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. 
LIBYUV_API int ARGBSobelXY(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/rotate.h000066400000000000000000000147661416500237200221500ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Supported rotation. typedef enum RotationMode { kRotate0 = 0, // No rotation. kRotate90 = 90, // Rotate 90 degrees clockwise. kRotate180 = 180, // Rotate 180 degrees. kRotate270 = 270, // Rotate 270 degrees clockwise. // Deprecated. kRotateNone = 0, kRotateClockwise = 90, kRotateCounterClockwise = 270, } RotationModeEnum; // Rotate I420 frame. LIBYUV_API int I420Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode); // Rotate I444 frame. LIBYUV_API int I444Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode); // Rotate NV12 input and store in I420. 
LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode); // Convert Android420 to I420 with rotation. // "rotation" can be 0, 90, 180 or 270. LIBYUV_API int Android420ToI420Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode rotation); // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API int RotatePlane(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height, enum RotationMode mode); // Rotate planes by 90, 180, 270. Deprecated. LIBYUV_API void RotatePlane90(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height); LIBYUV_API void RotatePlane180(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height); LIBYUV_API void RotatePlane270(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height); // Rotations for when U and V are interleaved. // These functions take one UV input pointer and // split the data into two buffers while // rotating them. // width and height expected to be half size for NV12. 
LIBYUV_API int SplitRotateUV(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode); LIBYUV_API void SplitRotateUV90(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height); LIBYUV_API void SplitRotateUV180(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height); LIBYUV_API void SplitRotateUV270(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height); // The 90 and 270 functions are based on transposes. // Doing a transpose with reversing the read/write // order will result in a rotation by +- 90 degrees. // Deprecated. LIBYUV_API void TransposePlane(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height); LIBYUV_API void SplitTransposeUV(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_ROTATE_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/rotate_argb.h000066400000000000000000000017671416500237200231400ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ #define INCLUDE_LIBYUV_ROTATE_ARGB_H_ #include "libyuv/basic_types.h" #include "libyuv/rotate.h" // For RotationMode. 
#ifdef __cplusplus namespace libyuv { extern "C" { #endif // Rotate ARGB frame LIBYUV_API int ARGBRotate(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int src_width, int src_height, enum RotationMode mode); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/rotate_row.h000066400000000000000000000176721416500237200230360ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ #define INCLUDE_LIBYUV_ROTATE_ROW_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif #if defined(__native_client__) #define LIBYUV_DISABLE_NEON #endif // MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif // The following are available for Visual C 32 bit: #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ !defined(__clang__) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif // The following are available for GCC 32 or 64 bit: #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 #endif // The following are available for 64 bit GCC: #if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) #define HAS_TRANSPOSEWX8_FAST_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_TRANSPOSEWX16_MSA #define HAS_TRANSPOSEUVWX16_MSA #endif #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #define HAS_TRANSPOSEWX8_MMI #define HAS_TRANSPOSEUVWX8_MMI #endif void TransposeWxH_C(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height); void TransposeWx8_C(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx16_C(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_MMI(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_Fast_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx16_MSA(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_Any_NEON(const uint8_t* 
src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_Any_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_Any_MMI(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeWx16_Any_MSA(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width); void TransposeUVWxH_C(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height); void TransposeUVWx8_C(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx8_SSE2(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx8_MMI(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx16_MSA(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx8_Any_SSE2(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx8_Any_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx8_Any_MMI(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); void TransposeUVWx16_Any_MSA(const uint8_t* src, int src_stride, uint8_t* dst_a, int 
dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/row.h000066400000000000000000007161301416500237200214530ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_ROW_H_ #define INCLUDE_LIBYUV_ROW_H_ #include <stdlib.h> // For malloc. #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif #if defined(__native_client__) #define LIBYUV_DISABLE_NEON #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif // clang >= 3.5.0 required for Arm64. #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) #define LIBYUV_DISABLE_NEON #endif // clang >= 3.5 #endif // __clang__ // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) #define GCC_HAS_AVX2 1 #endif // GNUC >= 4.7 #endif // __GNUC__ // clang >= 3.4.0 required for AVX2.
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) #if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4)) #define CLANG_HAS_AVX2 1 #endif // clang >= 3.4 #endif // __clang__ // clang >= 7.0.0 required for AVX512. #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) // clang in xcode follows a different versioning scheme. // TODO(fbarchard): fix xcode 9 ios b/789. #if (__clang_major__ >= 7) && !defined(__APPLE__) #define CLANG_HAS_AVX512 1 #endif // clang >= 7 #endif // __clang__ // Visual C 2012 required for AVX2. #if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) // Conversions: #define HAS_ABGRTOYROW_SSSE3 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ABGRTOUVROW_SSSE3 #endif #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 #define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565DITHERROW_SSE2 #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBTOUV444ROW_SSSE3 #define HAS_ARGBTOUVJROW_SSSE3 #define HAS_ARGBTOUVROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 #endif #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 #define HAS_HALFFLOATROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TORGB24ROW_SSSE3 #define HAS_I422TORGB565ROW_SSSE3 #define HAS_I422TORGBAROW_SSSE3 #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define
HAS_I444TOARGBROW_SSSE3 #define HAS_J400TOARGBROW_SSE2 #define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORSPLITUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB24ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 #define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOYROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 #if !defined(LIBYUV_BIT_EXACT) #define HAS_RGB24TOYJROW_SSSE3 #define HAS_RAWTOYJROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3 #endif #define HAS_SETROW_ERMS #define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 #define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOYROW_SSE2 #define HAS_YUY2TOARGBROW_SSSE3 #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBATTENUATEROW_SSSE3 #endif #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 #define HAS_ARGBCOPYALPHAROW_SSE2 #define HAS_ARGBCOPYYTOALPHAROW_SSE2 #define HAS_ARGBGRAYROW_SSSE3 #define HAS_ARGBLUMACOLORTABLEROW_SSSE3 #define HAS_ARGBMIRRORROW_SSE2 #define HAS_ARGBMULTIPLYROW_SSE2 #define HAS_ARGBPOLYNOMIALROW_SSE2 #define HAS_ARGBQUANTIZEROW_SSE2 #define HAS_ARGBSEPIAROW_SSSE3 #define HAS_ARGBSHADEROW_SSE2 #define HAS_ARGBSUBTRACTROW_SSE2 #define HAS_ARGBUNATTENUATEROW_SSE2 #define HAS_BLENDPLANEROW_SSSE3 #define HAS_COMPUTECUMULATIVESUMROW_SSE2 #define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #define HAS_INTERPOLATEROW_SSSE3 #define HAS_RGBCOLORTABLEROW_X86 #define HAS_SOBELROW_SSE2 #define HAS_SOBELTOPLANEROW_SSE2 #define HAS_SOBELXROW_SSE2 #define HAS_SOBELXYROW_SSE2 #define HAS_SOBELYROW_SSE2 // The following 
functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I444ALPHATOARGBROW_SSSE3 #endif #endif // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. #if !defined(LIBYUV_DISABLE_X86) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ defined(GCC_HAS_AVX2)) #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 #define HAS_ARGBEXTRACTALPHAROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTORGB565DITHERROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_RAWTOYJROW_AVX2 #define HAS_RGB24TOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBTOUVJROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #endif #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 // #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 #define HAS_I422TORGB24ROW_AVX2 #define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TORGBAROW_AVX2 #define HAS_I444TOARGBROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB24ROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 #define HAS_NV21TORGB24ROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 #define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOYROW_AVX2 #define HAS_YUY2TOARGBROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define 
HAS_YUY2TOYROW_AVX2 // Effects: #define HAS_ARGBADDROW_AVX2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ARGBATTENUATEROW_AVX2 #endif #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 #if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_AVX2 #define HAS_I444ALPHATOARGBROW_AVX2 #endif #endif // The following are available for AVX2 Visual C 32 bit: // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ !defined(__clang__) && defined(VISUALC_HAS_AVX2) #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 #define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2 #define HAS_ARGBTORGB565ROW_AVX2 #define HAS_J400TOARGBROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2 #endif // The following are also available on x64 Visual C. 
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \ (!defined(__clang__) || defined(__SSSE3__)) #define HAS_I444ALPHATOARGBROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3 #define HAS_I422ALPHATOARGBROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #endif // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR64ROW_SSSE3 #define HAS_ARGBTOAB64ROW_SSSE3 #define HAS_AR64TOARGBROW_SSSE3 #define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 #define HAS_I212TOAR30ROW_SSSE3 #define HAS_I212TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 #define HAS_I410TOAR30ROW_SSSE3 #define HAS_I410TOARGBROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 #define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV21TOYUV24ROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 #define HAS_P210TOARGBROW_SSSE3 #define HAS_P410TOAR30ROW_SSSE3 #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 #if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_SSSE3 #endif #define HAS_SPLITARGBROW_SSE2 #define HAS_SPLITARGBROW_SSSE3 #define HAS_SPLITXRGBROW_SSE2 #define HAS_SPLITXRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I210ALPHATOARGBROW_SSSE3 #define HAS_I410ALPHATOARGBROW_SSSE3 #endif #endif // The following are available for AVX2 gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || 
defined(__i386__)) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ABGRTOAR30ROW_AVX2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_ABGRTOUVROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #endif #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 #define HAS_ARGBTOAR64ROW_AVX2 #define HAS_ARGBTOAB64ROW_AVX2 #define HAS_AR64TOARGBROW_AVX2 #define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 #define HAS_HALFMERGEUVROW_AVX2 #define HAS_MERGEAR64ROW_AVX2 #define HAS_MERGEARGB16TO8ROW_AVX2 #define HAS_MERGEARGBROW_AVX2 #define HAS_MERGEXR30ROW_AVX2 #define HAS_MERGEXR64ROW_AVX2 #define HAS_MERGEXRGB16TO8ROW_AVX2 #define HAS_MERGEXRGBROW_AVX2 #define HAS_NV21TOYUV24ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 #define HAS_I212TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 #define HAS_I410TOAR30ROW_AVX2 #define HAS_I410TOARGBROW_AVX2 #define HAS_P210TOAR30ROW_AVX2 #define HAS_P210TOARGBROW_AVX2 #define HAS_P410TOAR30ROW_AVX2 #define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MIRRORUVROW_AVX2 #define HAS_MULTIPLYROW_16_AVX2 #if !defined(LIBYUV_BIT_EXACT) #define HAS_RGBATOYJROW_AVX2 #endif #define HAS_SPLITARGBROW_AVX2 #define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I210ALPHATOARGBROW_AVX2 #define HAS_I410ALPHATOARGBROW_AVX2 #endif #endif // The following are available for AVX512 clang x86 platforms: // TODO(fbarchard): Port to GCC and Visual C // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. 
Issue libyuv:789 #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512)) #define HAS_ARGBTORGB24ROW_AVX512VBMI #endif // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON #define HAS_ARGB1555TOARGBROW_NEON #define HAS_ARGB1555TOUVROW_NEON #define HAS_ARGB1555TOYROW_NEON #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON #define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTOAR64ROW_NEON #define HAS_ARGBTOAB64ROW_NEON #define HAS_AR64TOARGBROW_NEON #define HAS_AB64TOARGBROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON #define HAS_AYUVTOYROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_BYTETOFLOATROW_NEON #define HAS_COPYROW_NEON #define HAS_DIVIDEROW_16_NEON #define HAS_HALFFLOATROW_NEON #define HAS_HALFMERGEUVROW_NEON #define HAS_I400TOARGBROW_NEON #define HAS_I444ALPHATOARGBROW_NEON #define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON #define HAS_I422TOARGBROW_NEON #define HAS_I422TORGB24ROW_NEON #define HAS_I422TORGB565ROW_NEON #define HAS_I422TORGBAROW_NEON #define HAS_I422TOUYVYROW_NEON #define HAS_I422TOYUY2ROW_NEON #define HAS_I444TOARGBROW_NEON #define HAS_J400TOARGBROW_NEON #define HAS_MERGEAR64ROW_NEON #define HAS_MERGEARGB16TO8ROW_NEON #define HAS_MERGEARGBROW_NEON #define HAS_MERGEXR30ROW_NEON #define 
HAS_MERGEXR64ROW_NEON #define HAS_MERGEXRGB16TO8ROW_NEON #define HAS_MERGEXRGBROW_NEON #define HAS_MERGEUVROW_NEON #define HAS_MERGEUVROW_16_NEON #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON #define HAS_MIRRORSPLITUVROW_NEON #define HAS_MULTIPLYROW_16_NEON #define HAS_NV12TOARGBROW_NEON #define HAS_NV12TORGB24ROW_NEON #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TOARGBROW_NEON #define HAS_NV21TORGB24ROW_NEON #define HAS_NV21TOYUV24ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTORGB24ROW_NEON #define HAS_RAWTORGBAROW_NEON #define HAS_RAWTOUVROW_NEON #define HAS_RAWTOUVJROW_NEON #define HAS_RAWTOYJROW_NEON #define HAS_RAWTOYROW_NEON #define HAS_RGB24TOARGBROW_NEON #define HAS_RGB24TOUVROW_NEON #define HAS_RGB24TOUVJROW_NEON #define HAS_RGB24TOYJROW_NEON #define HAS_RGB24TOYROW_NEON #define HAS_RGB565TOARGBROW_NEON #define HAS_RGB565TOUVROW_NEON #define HAS_RGB565TOYROW_NEON #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYJROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON #define HAS_SPLITARGBROW_NEON #define HAS_SPLITXRGBROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_SPLITUVROW_16_NEON #define HAS_SWAPUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON #define HAS_YUY2TOARGBROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON // Effects: #define HAS_ARGBADDROW_NEON #define HAS_ARGBATTENUATEROW_NEON #define HAS_ARGBBLENDROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON #define HAS_RGB24MIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON #define HAS_ARGBSHADEROW_NEON #define HAS_ARGBSHUFFLEROW_NEON #define HAS_ARGBSUBTRACTROW_NEON #define HAS_INTERPOLATEROW_NEON #define HAS_SOBELROW_NEON #define HAS_SOBELTOPLANEROW_NEON #define HAS_SOBELXROW_NEON #define 
HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON #endif // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #define HAS_SCALESUMSAMPLES_NEON #define HAS_GAUSSROW_F32_NEON #define HAS_GAUSSCOL_F32_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #define HAS_ABGRTOUVROW_MSA #define HAS_ABGRTOYROW_MSA #define HAS_ARGB1555TOARGBROW_MSA #define HAS_ARGB1555TOUVROW_MSA #define HAS_ARGB1555TOYROW_MSA #define HAS_ARGB4444TOARGBROW_MSA #define HAS_ARGBADDROW_MSA #define HAS_ARGBATTENUATEROW_MSA #define HAS_ARGBBLENDROW_MSA #define HAS_ARGBCOLORMATRIXROW_MSA #define HAS_ARGBEXTRACTALPHAROW_MSA #define HAS_ARGBGRAYROW_MSA #define HAS_ARGBMIRRORROW_MSA #define HAS_ARGBMULTIPLYROW_MSA #define HAS_ARGBQUANTIZEROW_MSA #define HAS_ARGBSEPIAROW_MSA #define HAS_ARGBSETROW_MSA #define HAS_ARGBSHADEROW_MSA #define HAS_ARGBSHUFFLEROW_MSA #define HAS_ARGBSUBTRACTROW_MSA #define HAS_ARGBTOARGB1555ROW_MSA #define HAS_ARGBTOARGB4444ROW_MSA #define HAS_ARGBTORAWROW_MSA #define HAS_ARGBTORGB24ROW_MSA #define HAS_ARGBTORGB565DITHERROW_MSA #define HAS_ARGBTORGB565ROW_MSA #define HAS_ARGBTOUV444ROW_MSA #define HAS_ARGBTOUVJROW_MSA #define HAS_ARGBTOUVROW_MSA #define HAS_ARGBTOYJROW_MSA #define HAS_ARGBTOYROW_MSA #define HAS_BGRATOUVROW_MSA #define HAS_BGRATOYROW_MSA #define HAS_HALFFLOATROW_MSA #define HAS_I400TOARGBROW_MSA #define HAS_I422TOUYVYROW_MSA #define HAS_I422TOYUY2ROW_MSA #define HAS_INTERPOLATEROW_MSA #define HAS_J400TOARGBROW_MSA #define HAS_MERGEUVROW_MSA #define HAS_MIRRORROW_MSA #define HAS_MIRRORUVROW_MSA #define HAS_MIRRORSPLITUVROW_MSA #define HAS_RAWTOARGBROW_MSA #define HAS_RAWTORGB24ROW_MSA #define HAS_RAWTOUVROW_MSA #define HAS_RAWTOYROW_MSA #define HAS_RGB24TOARGBROW_MSA #define HAS_RGB24TOUVROW_MSA #define HAS_RGB24TOYROW_MSA #define HAS_RGB565TOARGBROW_MSA #define HAS_RGB565TOUVROW_MSA #define HAS_RGB565TOYROW_MSA #define HAS_RGBATOUVROW_MSA #define HAS_RGBATOYROW_MSA #define HAS_SETROW_MSA 
#define HAS_SOBELROW_MSA #define HAS_SOBELTOPLANEROW_MSA #define HAS_SOBELXROW_MSA #define HAS_SOBELXYROW_MSA #define HAS_SOBELYROW_MSA #define HAS_SPLITUVROW_MSA #define HAS_UYVYTOUVROW_MSA #define HAS_UYVYTOYROW_MSA #define HAS_YUY2TOUV422ROW_MSA #define HAS_YUY2TOUVROW_MSA #define HAS_YUY2TOYROW_MSA #endif #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #define HAS_ABGRTOUVROW_MMI #define HAS_ABGRTOYROW_MMI #define HAS_ARGB1555TOARGBROW_MMI #define HAS_ARGB1555TOUVROW_MMI #define HAS_ARGB1555TOYROW_MMI #define HAS_ARGB4444TOARGBROW_MMI #define HAS_ARGB4444TOUVROW_MMI #define HAS_ARGB4444TOYROW_MMI #define HAS_ARGBADDROW_MMI #define HAS_ARGBATTENUATEROW_MMI #define HAS_ARGBBLENDROW_MMI #define HAS_ARGBCOLORMATRIXROW_MMI #define HAS_ARGBCOPYALPHAROW_MMI #define HAS_ARGBCOPYYTOALPHAROW_MMI #define HAS_ARGBEXTRACTALPHAROW_MMI #define HAS_ARGBGRAYROW_MMI #define HAS_ARGBMIRRORROW_MMI #define HAS_ARGBMULTIPLYROW_MMI #define HAS_ARGBSEPIAROW_MMI #define HAS_ARGBSETROW_MMI #define HAS_ARGBSHADEROW_MMI #define HAS_ARGBSHUFFLEROW_MMI #define HAS_ARGBSUBTRACTROW_MMI #define HAS_ARGBTOARGB1555ROW_MMI #define HAS_ARGBTOARGB4444ROW_MMI #define HAS_ARGBTORAWROW_MMI #define HAS_ARGBTORGB24ROW_MMI #define HAS_ARGBTORGB565DITHERROW_MMI #define HAS_ARGBTORGB565ROW_MMI #define HAS_ARGBTOUV444ROW_MMI #define HAS_ARGBTOUVJROW_MMI #define HAS_ARGBTOUVROW_MMI #define HAS_ARGBTOYJROW_MMI #define HAS_ARGBTOYROW_MMI #define HAS_BGRATOUVROW_MMI #define HAS_BGRATOYROW_MMI #define HAS_BLENDPLANEROW_MMI #define HAS_COMPUTECUMULATIVESUMROW_MMI #define HAS_CUMULATIVESUMTOAVERAGEROW_MMI #define HAS_HALFFLOATROW_MMI #define HAS_I400TOARGBROW_MMI #define HAS_I422TOUYVYROW_MMI #define HAS_I422TOYUY2ROW_MMI #define HAS_INTERPOLATEROW_MMI #define HAS_J400TOARGBROW_MMI #define HAS_MERGERGBROW_MMI #define HAS_MERGEUVROW_MMI #define HAS_MIRRORROW_MMI #define HAS_MIRRORSPLITUVROW_MMI #define HAS_RAWTOARGBROW_MMI #define HAS_RAWTORGB24ROW_MMI #define HAS_RAWTOUVROW_MMI #define 
HAS_RAWTOYROW_MMI #define HAS_RGB24TOARGBROW_MMI #define HAS_RGB24TOUVROW_MMI #define HAS_RGB24TOYROW_MMI #define HAS_RGB565TOARGBROW_MMI #define HAS_RGB565TOUVROW_MMI #define HAS_RGB565TOYROW_MMI #define HAS_RGBATOUVROW_MMI #define HAS_RGBATOYROW_MMI #define HAS_SOBELROW_MMI #define HAS_SOBELTOPLANEROW_MMI #define HAS_SOBELXROW_MMI #define HAS_SOBELXYROW_MMI #define HAS_SOBELYROW_MMI #define HAS_SPLITRGBROW_MMI #define HAS_SPLITUVROW_MMI #define HAS_UYVYTOUVROW_MMI #define HAS_UYVYTOYROW_MMI #define HAS_YUY2TOUV422ROW_MMI #define HAS_YUY2TOUVROW_MMI #define HAS_YUY2TOYROW_MMI #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) #if defined(VISUALC_HAS_AVX2) #define SIMD_ALIGNED(var) __declspec(align(32)) var #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif #define LIBYUV_NOINLINE __declspec(noinline) typedef __declspec(align(16)) int16_t vec16[8]; typedef __declspec(align(16)) int32_t vec32[4]; typedef __declspec(align(16)) float vecf32[4]; typedef __declspec(align(16)) int8_t vec8[16]; typedef __declspec(align(16)) uint16_t uvec16[8]; typedef __declspec(align(16)) uint32_t uvec32[4]; typedef __declspec(align(16)) uint8_t uvec8[16]; typedef __declspec(align(32)) int16_t lvec16[16]; typedef __declspec(align(32)) int32_t lvec32[8]; typedef __declspec(align(32)) int8_t lvec8[32]; typedef __declspec(align(32)) uint16_t ulvec16[16]; typedef __declspec(align(32)) uint32_t ulvec32[8]; typedef __declspec(align(32)) uint8_t ulvec8[32]; #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. 
#if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) #define SIMD_ALIGNED(var) var __attribute__((aligned(32))) #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif #define LIBYUV_NOINLINE __attribute__((noinline)) typedef int16_t __attribute__((vector_size(16))) vec16; typedef int32_t __attribute__((vector_size(16))) vec32; typedef float __attribute__((vector_size(16))) vecf32; typedef int8_t __attribute__((vector_size(16))) vec8; typedef uint16_t __attribute__((vector_size(16))) uvec16; typedef uint32_t __attribute__((vector_size(16))) uvec32; typedef uint8_t __attribute__((vector_size(16))) uvec8; typedef int16_t __attribute__((vector_size(32))) lvec16; typedef int32_t __attribute__((vector_size(32))) lvec32; typedef int8_t __attribute__((vector_size(32))) lvec8; typedef uint16_t __attribute__((vector_size(32))) ulvec16; typedef uint32_t __attribute__((vector_size(32))) ulvec32; typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var #define LIBYUV_NOINLINE typedef int16_t vec16[8]; typedef int32_t vec32[4]; typedef float vecf32[4]; typedef int8_t vec8[16]; typedef uint16_t uvec16[8]; typedef uint32_t uvec32[4]; typedef uint8_t uvec8[16]; typedef int16_t lvec16[16]; typedef int32_t lvec32[8]; typedef int8_t lvec8[32]; typedef uint16_t ulvec16[16]; typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif #if defined(__aarch64__) || defined(__arm__) // This struct is for ARM color conversion. struct YuvConstants { uvec8 kUVCoeff; vec16 kRGBCoeffBias; }; #else // This struct is for Intel color conversion. 
struct YuvConstants { uint8_t kUVToB[32]; uint8_t kUVToG[32]; uint8_t kUVToR[32]; int16_t kYToRgb[16]; int16_t kYBiasToRgb[16]; }; // Offsets into YuvConstants structure #define KUVTOB 0 #define KUVTOG 32 #define KUVTOR 64 #define KYTORGB 96 #define KYBIASTORGB 128 #endif #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) #define align_buffer_64(var, size) \ uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ #define free_aligned_buffer_64(var) \ free(var##_mem); \ var = 0 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP #else #define OMITFP __attribute__((optimize("omit-frame-pointer"))) #endif // NaCL macros for GCC x86 and x64. #if defined(__native_client__) #define LABELALIGN ".p2align 5\n" #else #define LABELALIGN #endif // Intel Code Analizer markers. Insert IACA_START IACA_END around code to be // measured and then run with iaca -64 libyuv_unittest. // IACA_ASM_START amd IACA_ASM_END are equivalents that can be used within // inline assembly blocks. 
// example of iaca: // ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest #if defined(__x86_64__) || defined(__i386__) #define IACA_ASM_START \ ".byte 0x0F, 0x0B\n" \ " movl $111, %%ebx\n" \ ".byte 0x64, 0x67, 0x90\n" #define IACA_ASM_END \ " movl $222, %%ebx\n" \ ".byte 0x64, 0x67, 0x90\n" \ ".byte 0x0F, 0x0B\n" #define IACA_SSC_MARK(MARK_ID) \ __asm__ __volatile__("\n\t movl $" #MARK_ID \ ", %%ebx" \ "\n\t .byte 0x64, 0x67, 0x90" \ : \ : \ : "memory"); #define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); #else /* Visual C */ #define IACA_UD_BYTES \ { __asm _emit 0x0F __asm _emit 0x0B } #define IACA_SSC_MARK(x) \ { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } #define IACA_VC64_START __writegsbyte(111, 111); #define IACA_VC64_END __writegsbyte(222, 222); #endif #define IACA_START \ { \ IACA_UD_BYTES \ IACA_SSC_MARK(111) \ } #define IACA_END \ { \ IACA_SSC_MARK(222) \ IACA_UD_BYTES \ } void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I444AlphaToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422AlphaToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); void 
I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); void NV21ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB24Row_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); void NV21ToRGB24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width); void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const 
struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422AlphaToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_MSA(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); void NV21ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* 
src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGBAToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width); void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToYJRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, 
uint8_t* dst_v, int width); void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_MMI(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_MMI(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVJRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* 
dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_MMI(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); void RGB24ToYRow_NEON(const uint8_t* src_rgb24, 
uint8_t* dst_y, int width); void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width); void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width); void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGBToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ARGBToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void BGRAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void ABGRToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGBAToYRow_C(const 
uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB24ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB24ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RAWToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RAWToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int 
width); void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void BGRAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void 
ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_AVX2(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_NEON(const 
uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVJRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVJRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, int 
src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void 
ARGBToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void ABGRToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGBAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB24ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RAWToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); void RGB565ToUVRow_C(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBToUV444Row_C(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSSE3(const 
uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, int width); void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void MirrorSplitUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void MirrorSplitUVRow_MMI(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void MirrorSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int 
width); void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_MMI(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void SplitUVRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* 
dst_uv, int width); void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); void MergeUVRow_MSA(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); void MergeUVRow_MMI(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width); void MergeUVRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void MergeUVRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void MergeUVRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void MergeUVRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void MergeUVRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void HalfMergeUVRow_C(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width); void HalfMergeUVRow_NEON(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width); void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width); void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width); void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitRGBRow_MMI(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void 
SplitRGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void MergeRGBRow_C(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width); void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width); void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width); void MergeRGBRow_MMI(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width); void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void MergeRGBRow_Any_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width); void MergeRGBRow_Any_MMI(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width); void MergeARGBRow_C(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width); void MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width); void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width); void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width); void MergeARGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, int width); void MergeARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, int width); void MergeARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, int width); void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, 
uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void SplitARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width); void MergeXRGBRow_C(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width); void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width); void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width); void MergeXRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width); void MergeXRGBRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void MergeXRGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void MergeXRGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* 
dst_b, int width); void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_NEON(const uint8_t* src_rgba, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void SplitXRGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width); void MergeXR30Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int depth, int width); void MergeAR64Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width); void MergeARGB16To8Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width); void MergeXR64Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width); void MergeXRGB16To8Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width); void MergeXR30Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int depth, int width); void MergeAR64Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width); void MergeARGB16To8Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, 
const uint16_t* src_a, uint8_t* dst_argb, int depth, int width); void MergeXR64Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width); void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width); void MergeXR30Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int depth, int width); void MergeXR30Row_10_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int /* depth */, int width); void MergeAR64Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width); void MergeARGB16To8Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width); void MergeXR64Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width); void MergeXRGB16To8Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width); void MergeXR30Row_Any_AVX2(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint8_t* dst_ptr, int depth, int width); void MergeAR64Row_Any_AVX2(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, const uint16_t* a_buf, uint16_t* dst_ptr, int depth, int width); void MergeXR64Row_Any_AVX2(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint16_t* dst_ptr, int depth, int width); void MergeARGB16To8Row_Any_AVX2(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, const uint16_t* a_buf, uint8_t* dst_ptr, int depth, int width); void MergeXRGB16To8Row_Any_AVX2(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint8_t* dst_ptr, int depth, int width); void MergeXR30Row_Any_NEON(const uint16_t* 
r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint8_t* dst_ptr, int depth, int width); void MergeXR30Row_10_Any_NEON(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint8_t* dst_ptr, int depth, int width); void MergeAR64Row_Any_NEON(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, const uint16_t* a_buf, uint16_t* dst_ptr, int depth, int width); void MergeARGB16To8Row_Any_NEON(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, const uint16_t* a_buf, uint8_t* dst_ptr, int depth, int width); void MergeXR64Row_Any_NEON(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint16_t* dst_ptr, int depth, int width); void MergeXRGB16To8Row_Any_NEON(const uint16_t* r_buf, const uint16_t* g_buf, const uint16_t* b_buf, uint8_t* dst_ptr, int depth, int width); void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width); void MergeUVRow_16_AVX2(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width); void MergeUVRow_16_Any_AVX2(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width); void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width); void MergeUVRow_16_Any_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width); void SplitUVRow_16_C(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width); void SplitUVRow_16_AVX2(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width); void SplitUVRow_16_Any_AVX2(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width); void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width); void SplitUVRow_16_Any_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width); void MultiplyRow_16_C(const uint16_t* src_y, uint16_t* dst_y, int 
scale, int width); void MultiplyRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); void MultiplyRow_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int scale, int width); void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); void MultiplyRow_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int scale, int width); void DivideRow_16_C(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); void DivideRow_16_AVX2(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); void DivideRow_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int scale, int width); void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width); void DivideRow_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int scale, int width); void Convert8To16Row_C(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); void Convert8To16Row_SSE2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); void Convert8To16Row_AVX2(const uint8_t* src_y, uint16_t* dst_y, int scale, int width); void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int scale, int width); void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int scale, int width); void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); void Convert16To8Row_SSSE3(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); void Convert16To8Row_AVX2(const uint16_t* src_y, uint8_t* dst_y, int scale, int width); void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, uint8_t* dst_ptr, int scale, int width); void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, uint8_t* dst_ptr, int scale, int width); void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); void 
CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyAlphaRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, uint8_t* dst_a, int width); void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBExtractAlphaRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_SSE2(const 
uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width); void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBCopyYToAlphaRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SetRow_C(uint8_t* dst, uint8_t v8, int width); void SetRow_MSA(uint8_t* dst, uint8_t v8, int width); void SetRow_X86(uint8_t* dst, uint8_t v8, int width); void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width); void SetRow_NEON(uint8_t* dst, uint8_t v8, int width); void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width); void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width); void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width); void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width); void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width); void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width); void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width); void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width); void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width); void ARGBSetRow_Any_MMI(uint8_t* dst_ptr, uint32_t v32, int width); // ARGBShufflers for BGRAToARGB etc. 
// Prototypes only (no definitions in this span); every function returns void
// and takes the pixel count/extent as a trailing `int width`.
//
// 1. ARGBShuffleRow_{C,SSSE3,AVX2,NEON,MSA,MMI} and their _Any_ counterparts:
//    (src, dst, const uint8_t* shuffler, width). The _Any_ declarations name
//    the same three pointer arguments src_ptr / dst_ptr / param.
// 2. <fmt>To{ARGB,RGBA,RGB24}Row_<suffix> for RGB24, RAW, RGB565, ARGB1555 and
//    ARGB4444 sources: (const uint8_t* src, uint8_t* dst, width).
//
// NOTE(review): the suffix (_C, _SSE2, _SSSE3, _AVX2, _NEON, _MSA, _MMI)
// presumably selects the portable or instruction-set-specific implementation;
// the layout of the `shuffler` table is not visible here -- confirm both
// against the definitions.
void ARGBShuffleRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); void ARGBShuffleRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); void ARGBShuffleRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width); void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); void ARGBShuffleRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint8_t* param, int width); void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); void RGB24ToARGBRow_NEON(const uint8_t* 
src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, uint8_t* dst_argb, int width); 
// Prototypes only (no definitions in this span); every function returns void
// and takes a trailing `int width`. Declared here, in order:
//
// 1. ARGB4444ToARGBRow_C and the AR30 helpers AR30ToARGBRow_C,
//    AR30ToABGRRow_C, ARGBToAR30Row_C, AR30ToAB30Row_C.
// 2. _Any_ variants of the <fmt>To{ARGB,RGBA,RGB24}Row functions; all use the
//    generic (src_ptr, dst_ptr, width) parameter names.
// 3. ARGBTo{RGB24,RAW,RGB565,ARGB1555,ARGB4444}Row and {ARGB,ABGR}ToAR30Row
//    per-suffix declarations. The ARGBToRGB565DitherRow_* variants take one
//    extra argument, `const uint32_t dither4`, before width.
// 4. ARGB <-> AR64/AB64 declarations: the 64-bit side is passed as uint16_t*,
//    the ARGB side as uint8_t*. AR64ShuffleRow_C instead takes uint8_t*
//    for both buffers plus a `const uint8_t* shuffler`.
//
// NOTE(review): ARGBToAR30Row_C is declared twice in this span with identical
// signatures (once with the AR30 helpers, once after ABGRToAR30Row_C). The
// redeclaration is compatible, but one copy looks redundant.
// NOTE(review): the encoding of `dither4` and the units of `width` are not
// visible from these declarations -- confirm against the definitions.
void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, uint8_t* dst_argb, int width); void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGBARow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void 
RAWToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB565ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB1555ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGB4444ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width); void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, const uint32_t dither4, int width); void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, const uint32_t dither4, int width); void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB1555Row_AVX2(const uint8_t* 
src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width); void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width); void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width); void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width); void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width); void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width); void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width); void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_C(const 
uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void AR64ShuffleRow_C(const uint8_t* src_ar64, uint8_t* dst_ar64, const uint8_t* shuffler, int width); void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, uint16_t* dst_ptr, int width); 
/*
 * Row-function prototypes (declarations only; no definitions appear here).
 * One prototype per line. The group headers below are derived purely from
 * the function-name prefixes/suffixes.
 */

/* AR64/AB64 <-> ARGB: remaining Any_SSSE3, then Any_AVX2 and Any_NEON. */
void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, uint8_t* dst_ptr, int width);
void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int width);
void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int width);
void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, uint8_t* dst_ptr, int width);
void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, uint8_t* dst_ptr, int width);
void ARGBToAR64Row_Any_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int width);
void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int width);
void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr, uint8_t* dst_ptr, int width);
void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, uint8_t* dst_ptr, int width);

/* J400 -> ARGB. */
void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void J400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

/* YUV -> RGB family, _C variants. All but NV21ToYUV24Row_C take a YuvConstants pointer. */
void I444ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I422ToAR30Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I210ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I210ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I212ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I212ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I410ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I410ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I210AlphaToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, const uint16_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I410AlphaToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, const uint16_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I422AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void NV12ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB565Row_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width);
void NV21ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB24Row_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void NV21ToRGB24Row_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void NV21ToYUV24Row_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width);
void YUY2ToARGBRow_C(const uint8_t* src_yuy2, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void UYVYToARGBRow_C(const uint8_t* src_uyvy, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void P210ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P410ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P210ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void P410ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I422ToRGBARow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB24Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB4444Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB1555Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB565Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width);

/* YUV -> RGB family, _SSSE3 / _AVX2 variants. */
void I422ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I422ToRGBARow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I444ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I444ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I422ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I422ToAR30Row_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I210ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I210ToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I212ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I212ToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I410ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I410ToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I422ToAR30Row_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I210ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I210ToAR30Row_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I212ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I212ToAR30Row_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I410ToAR30Row_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I410ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void NV12ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width);
void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width);
void NV21ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width);
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width);
void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width);
void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width);
void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* vu_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void NV21ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* vu_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P210ToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P410ToARGBRow_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P210ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void P410ToAR30Row_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void P210ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P410ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void P210ToAR30Row_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void P410ToAR30Row_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width);
void I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB4444Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB1555Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width);

/* YUV -> RGB family, _Any_SSSE3 / _Any_AVX2 variants (generic dst_ptr output). */
void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I212ToAR30Row_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I212ToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
/*
 * Row-function prototypes (declarations only; no definitions appear here).
 * Each prototype and each `//` section comment sits on its own line: a `//`
 * comment runs to the end of the physical line, so sharing a line with the
 * prototypes that follow it would comment them out.
 */

/* YUV -> RGB family, _Any_SSSE3 / _Any_AVX2 variants (generic dst_ptr output). */
void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I212ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I212ToAR30Row_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I410ToAR30Row_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I410ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I210AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I410AlphaToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I444AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width);
void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width);
void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);

/* I400 -> ARGB. */
void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, int width);
void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, int width);
void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* param, int width);
void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);
void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width);

// ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBBlendRow_MSA(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBBlendRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBBlendRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);

// Unattenuated planar alpha blend.
/*
 * Planar blend, ARGB multiply and ARGB add row-function prototypes
 * (declarations only; no definitions appear here). Each prototype and each
 * `//` section comment sits on its own line: a `//` comment runs to the end
 * of the physical line, so sharing a line with the prototypes that follow it
 * would comment them out.
 */
void BlendPlaneRow_SSSE3(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width);
void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width);
void BlendPlaneRow_AVX2(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width);
void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width);
void BlendPlaneRow_MMI(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width);
void BlendPlaneRow_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width);
void BlendPlaneRow_C(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width);

// ARGB multiply images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBMultiplyRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBMultiplyRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);

// ARGB add images.
void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBAddRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBAddRow_MSA(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBAddRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);
void ARGBAddRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width);
void ARGBAddRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width);

// ARGB subtract images. Same API as Blend, but these require
// pointer and width alignment for SSE2.
void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width); void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void ARGBSubtractRow_MSA(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void ARGBSubtractRow_MMI(const uint8_t* src_argb0, const uint8_t* src_argb1, uint8_t* dst_argb, int width); void ARGBSubtractRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, uint8_t* dst_ptr, 
int width); void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); void ARGBToRGB24Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRAWRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB1555Row_Any_MMI(const 
uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToARGB4444Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToRGB565DitherRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, const uint32_t param, int width); void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I444AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf, const 
uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV21ToYUV24Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, int width); void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void P210ToARGBRow_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void P410ToARGBRow_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void P210ToAR30Row_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); void P410ToAR30Row_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct 
YuvConstants* yuvconstants, int width); void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I444ToARGBRow_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGBRow_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int 
width); void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); void YUY2ToUVRow_C(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_C(const uint8_t* 
src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void YUY2ToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, int stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, int stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, 
uint8_t* dst_u, uint8_t* dst_v, int width);
// The AVX2 variants of UYVYToYRow / UYVYToUVRow / UYVYToUV422Row are declared
// once, just ahead of the SSE2 variants; they are deliberately not repeated
// here (identical redeclarations only trigger -Wredundant-decls noise).
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width);
void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
void UYVYToUVRow_MMI(const uint8_t* src_uyvy,
                     int src_stride_uyvy,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width);
void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);
void UYVYToUV422Row_MMI(const uint8_t* src_uyvy,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width);

// _C variants.
void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
void UYVYToUVRow_C(const uint8_t* src_uyvy,
                   int src_stride_uyvy,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width);
void UYVYToUV422Row_C(const uint8_t* src_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);

// _Any_ variants (AVX2, SSE2); the NEON/MSA/MMI ones follow.
void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width);
void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
                             int width);
void
UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToUVRow_Any_MMI(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width); void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width); void AYUVToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void 
AYUVToUVRow_Any_NEON(const uint8_t* src_ptr,
                     int src_stride,
                     uint8_t* dst_uv,  // named as in AYUVToUVRow_C / _NEON
                     int width);
void AYUVToVURow_Any_NEON(const uint8_t* src_ptr,
                          int src_stride,
                          uint8_t* dst_vu,
                          int width);

// I422 planes (Y, U, V) to a packed YUY2 / UYVY row.
void I422ToYUY2Row_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width);
void I422ToUYVYRow_C(const uint8_t* src_y,
                     const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_frame,
                     int width);
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width);
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width);
void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width);
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width);
void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width);
void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width);
void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* dst_ptr,
                            int width);
void I422ToYUY2Row_MSA(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t*
dst_yuy2, int width); void I422ToYUY2Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width); void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width); void I422ToUYVYRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width); void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void I422ToYUY2Row_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); void I422ToUYVYRow_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, int width); // Effects related row functions. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBAttenuateRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBAttenuateRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); // Inverse table for unattenuate, shared by C and SSE2. 
extern const uint32_t fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width); void ARGBSepiaRow_C(uint8_t* dst_argb, int width); void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width); void ARGBColorMatrixRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width); void ARGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width); void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width); void RGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width); void RGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, 
int width); void ARGBQuantizeRow_C(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width); void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width); void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width); void ARGBQuantizeRow_MSA(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width); void ARGBShadeRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); void ARGBShadeRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); void ARGBShadeRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value); // Used for blur. void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, const int32_t* botleft, int width, int area, uint8_t* dst, int count); void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width); void ComputeCumulativeSumRow_MMI(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width); void CumulativeSumToAverageRow_C(const int32_t* tl, const int32_t* bl, int w, int area, uint8_t* dst, int count); void ComputeCumulativeSumRow_C(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width); LIBYUV_API void ARGBAffineRow_C(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* uv_dudv, int width); LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* src_dudv, int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. 
// Row interpolation entry points. Every variant takes, in order: the
// destination row, the source row, the source stride, a width, and
// source_y_fraction. Declarations are grouped per instruction set, each base
// variant followed by its _Any_ counterpart.

// _C variants (8-bit and 16-bit samples).
void InterpolateRow_C(uint8_t* dst_ptr,
                      const uint8_t* src_ptr,
                      ptrdiff_t src_stride,
                      int width,
                      int source_y_fraction);
void InterpolateRow_16_C(uint16_t* dst_ptr,
                         const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction);

// SSSE3.
void InterpolateRow_SSSE3(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction);
void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
                              const uint8_t* src_ptr,
                              ptrdiff_t src_stride_ptr,
                              int width,
                              int source_y_fraction);

// AVX2.
void InterpolateRow_AVX2(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction);
void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
                             const uint8_t* src_ptr,
                             ptrdiff_t src_stride_ptr,
                             int width,
                             int source_y_fraction);

// NEON.
void InterpolateRow_NEON(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction);
void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
                             const uint8_t* src_ptr,
                             ptrdiff_t src_stride_ptr,
                             int width,
                             int source_y_fraction);

// MSA.
void InterpolateRow_MSA(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int source_y_fraction);
void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
                            const uint8_t* src_ptr,
                            ptrdiff_t src_stride_ptr,
                            int width,
                            int source_y_fraction);

// MMI.
void InterpolateRow_MMI(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int width,
                        int source_y_fraction);
void InterpolateRow_Any_MMI(uint8_t* dst_ptr,
                            const uint8_t* src_ptr,
                            ptrdiff_t src_stride_ptr,
                            int width,
                            int source_y_fraction);

// Sobel images.
// SobelX: three source rows (src_y0..src_y2) in, one dst_sobelx row out.
void SobelXRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 const uint8_t* src_y2,
                 uint8_t* dst_sobelx,
                 int width);
void SobelXRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width);
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width);
void SobelXRow_MSA(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   const uint8_t* src_y2,
                   uint8_t* dst_sobelx,
                   int width);
void SobelXRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   const uint8_t* src_y2,
                   uint8_t* dst_sobelx,
                   int width);

// SobelY: two source rows (src_y0, src_y1) in, one dst_sobely row out.
void SobelYRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 uint8_t* dst_sobely,
                 int width);
void SobelYRow_SSE2(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width);
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width);
void SobelYRow_MSA(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   uint8_t* dst_sobely,
                   int width);
void SobelYRow_MMI(const uint8_t* src_y0,
                   const uint8_t* src_y1,
                   uint8_t* dst_sobely,
                   int width);

// Sobel: src_sobelx + src_sobely rows in, dst_argb row out.
void SobelRow_C(const uint8_t* src_sobelx,
                const uint8_t* src_sobely,
                uint8_t* dst_argb,
                int width);
void SobelRow_SSE2(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width);
void SobelRow_Any_SSE2(const uint8_t* y_buf,
                       const uint8_t* uv_buf,
                       uint8_t* dst_ptr,
                       int width);
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width);
void SobelRow_Any_NEON(const uint8_t* y_buf,
                       const uint8_t* uv_buf,
                       uint8_t* dst_ptr,
                       int width);
void SobelRow_MSA(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width);
void SobelRow_Any_MSA(const uint8_t* y_buf,
                      const uint8_t* uv_buf,
                      uint8_t* dst_ptr,
                      int width);
void SobelRow_MMI(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width);
void SobelRow_Any_MMI(const uint8_t* y_buf,
                      const uint8_t* uv_buf,
                      uint8_t* dst_ptr,
                      int width);

// SobelToPlane: src_sobelx + src_sobely rows in, dst_y row out.
void SobelToPlaneRow_C(const uint8_t* src_sobelx,
                       const uint8_t* src_sobely,
                       uint8_t* dst_y,
                       int width);
void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width);
void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
                              int width);
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width);
void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
                              int width);
void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
                         const uint8_t* src_sobely,
                         uint8_t* dst_y,
                         int width);
void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
                             const uint8_t* uv_buf,
                             uint8_t* dst_ptr,
                             int width);
void SobelToPlaneRow_MMI(const uint8_t* src_sobelx,
                         const uint8_t* src_sobely,
                         uint8_t* dst_y,
                         int width);
void SobelToPlaneRow_Any_MMI(const uint8_t* y_buf,
                             const uint8_t* uv_buf,
                             uint8_t* dst_ptr,
                             int width);

// SobelXY: src_sobelx + src_sobely rows in, dst_argb row out.
void SobelXYRow_C(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width);
void SobelXYRow_SSE2(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width);
void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
                         const uint8_t* uv_buf,
                         uint8_t* dst_ptr,
                         int width);
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width);
void SobelXYRow_Any_NEON(const uint8_t* y_buf,
                         const uint8_t* uv_buf,
                         uint8_t* dst_ptr,
                         int width);
void SobelXYRow_MSA(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width);
void SobelXYRow_Any_MSA(const uint8_t* y_buf,
                        const uint8_t* uv_buf,
                        uint8_t* dst_ptr,
                        int width);
void SobelXYRow_MMI(const uint8_t* src_sobelx,
                    const uint8_t* src_sobely,
                    uint8_t* dst_argb,
                    int width);
void SobelXYRow_Any_MMI(const uint8_t* y_buf,
                        const uint8_t* uv_buf,
                        uint8_t* dst_ptr,
                        int width);

// ARGB polynomial row functions (the SSE2/AVX2 variants follow).
void ARGBPolynomialRow_C(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const float* poly,
                         int width);
void
ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width); void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width); // Scale and convert to half float. void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_SSE2(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, float param, int width); void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, float param, int width); void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_Any_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloat1Row_Any_F16C(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, float param, int width); void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, float param, int width); void HalfFloatRow_MSA(const uint16_t* src, uint16_t* dst, float scale, int width); void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, uint16_t* dst_ptr, float param, int width); void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width); void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, float* dst_ptr, float param, int width); void ARGBLumaColorTableRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, const uint8_t* luma, uint32_t lumacoeff); void ARGBLumaColorTableRow_SSSE3(const uint8_t* 
src_argb, uint8_t* dst_argb, int width, const uint8_t* luma, uint32_t lumacoeff); float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); float ScaleMaxSamples_NEON(const float* src, float* dst, float scale, int width); float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); float ScaleSumSamples_NEON(const float* src, float* dst, float scale, int width); void ScaleSamples_C(const float* src, float* dst, float scale, int width); void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); void I210ToARGBRow_MMI(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422AlphaToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB565Row_MMI(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* 
yuvconstants, int width); void NV21ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB24Row_MMI(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); void NV21ToRGB24Row_MMI(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); void I210ToARGBRow_Any_MMI(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGBARow_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422AlphaToARGBRow_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGB24Row_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToRGB565Row_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGB4444Row_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void I422ToARGB1555Row_Any_MMI(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV12ToARGBRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* 
yuvconstants, int width); void NV12ToRGB565Row_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV21ToARGBRow_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV12ToRGB24Row_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void NV21ToRGB24Row_Any_MMI(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void YUY2ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void UYVYToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); void GaussRow_F32_NEON(const float* src, float* dst, int width); void GaussRow_F32_C(const float* src, float* dst, int width); void GaussCol_F32_NEON(const float* src0, const float* src1, const float* src2, const float* src3, const float* src4, float* dst, int width); void GaussCol_F32_C(const float* src0, const float* src1, const float* src2, const float* src3, const float* src4, float* dst, int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_ROW_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/scale.h000066400000000000000000000177761416500237200217450ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #ifndef INCLUDE_LIBYUV_SCALE_H_ #define INCLUDE_LIBYUV_SCALE_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Supported filtering. typedef enum FilterMode { kFilterNone = 0, // Point sample; Fastest. kFilterLinear = 1, // Filter horizontally only. kFilterBilinear = 2, // Faster than box, but lower quality scaling down. kFilterBox = 3 // Highest quality. } FilterModeEnum; // Scale a YUV plane. LIBYUV_API void ScalePlane(const uint8_t* src, int src_stride, int src_width, int src_height, uint8_t* dst, int dst_stride, int dst_width, int dst_height, enum FilterMode filtering); LIBYUV_API void ScalePlane_16(const uint16_t* src, int src_stride, int src_width, int src_height, uint16_t* dst, int dst_stride, int dst_width, int dst_height, enum FilterMode filtering); // Sample is expected to be in the low 12 bits. LIBYUV_API void ScalePlane_12(const uint16_t* src, int src_stride, int src_width, int src_height, uint16_t* dst, int dst_stride, int dst_width, int dst_height, enum FilterMode filtering); // Scales a YUV 4:2:0 image from the src width and height to the // dst width and height. // If filtering is kFilterNone, a simple nearest-neighbor algorithm is // used. This produces basic (blocky) quality at the fastest speed. // If filtering is kFilterBilinear, interpolation is used to produce a better // quality image, at the expense of speed. // If filtering is kFilterBox, averaging is used to produce ever better // quality image, at further expense of speed. // Returns 0 if successful. 
LIBYUV_API
int I420Scale(const uint8_t* src_y, int src_stride_y,
              const uint8_t* src_u, int src_stride_u,
              const uint8_t* src_v, int src_stride_v,
              int src_width, int src_height,
              uint8_t* dst_y, int dst_stride_y,
              uint8_t* dst_u, int dst_stride_u,
              uint8_t* dst_v, int dst_stride_v,
              int dst_width, int dst_height,
              enum FilterMode filtering);

// Same parameter layout as I420Scale, with uint16_t samples.
LIBYUV_API
int I420Scale_16(const uint16_t* src_y, int src_stride_y,
                 const uint16_t* src_u, int src_stride_u,
                 const uint16_t* src_v, int src_stride_v,
                 int src_width, int src_height,
                 uint16_t* dst_y, int dst_stride_y,
                 uint16_t* dst_u, int dst_stride_u,
                 uint16_t* dst_v, int dst_stride_v,
                 int dst_width, int dst_height,
                 enum FilterMode filtering);

// Same parameter layout as I420Scale_16.
// NOTE(review): presumably samples sit in the low 12 bits, as stated for
// ScalePlane_12 - confirm against the implementation.
LIBYUV_API
int I420Scale_12(const uint16_t* src_y, int src_stride_y,
                 const uint16_t* src_u, int src_stride_u,
                 const uint16_t* src_v, int src_stride_v,
                 int src_width, int src_height,
                 uint16_t* dst_y, int dst_stride_y,
                 uint16_t* dst_u, int dst_stride_u,
                 uint16_t* dst_v, int dst_stride_v,
                 int dst_width, int dst_height,
                 enum FilterMode filtering);

// Scales a YUV 4:4:4 image from the src width and height to the
// dst width and height. The filtering argument trades speed for quality:
//   kFilterNone     - simple nearest-neighbor algorithm; basic (blocky)
//                     quality at the fastest speed.
//   kFilterBilinear - interpolation; a better quality image, at the
//                     expense of speed.
//   kFilterBox      - averaging; an even better quality image, at further
//                     expense of speed.
// Returns 0 if successful.
LIBYUV_API int I444Scale(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_width, int src_height, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering); LIBYUV_API int I444Scale_16(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, int src_width, int src_height, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering); LIBYUV_API int I444Scale_12(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, int src_width, int src_height, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering); // Scales an NV12 image from the src width and height to the // dst width and height. // If filtering is kFilterNone, a simple nearest-neighbor algorithm is // used. This produces basic (blocky) quality at the fastest speed. // If filtering is kFilterBilinear, interpolation is used to produce a better // quality image, at the expense of speed. // kFilterBox is not supported for the UV channel and will be treated as // bilinear. // Returns 0 if successful. LIBYUV_API int NV12Scale(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, int src_width, int src_height, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int dst_width, int dst_height, enum FilterMode filtering); #ifdef __cplusplus // Legacy API. Deprecated. 
LIBYUV_API int Scale(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, int src_stride_y, int src_stride_u, int src_stride_v, int src_width, int src_height, uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v, int dst_stride_y, int dst_stride_u, int dst_stride_v, int dst_width, int dst_height, LIBYUV_BOOL interpolate); // For testing, allow disabling of specialized scalers. LIBYUV_API void SetUseReferenceImpl(LIBYUV_BOOL use); #endif // __cplusplus #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_SCALE_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/scale_argb.h000066400000000000000000000046141416500237200227230ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ #define INCLUDE_LIBYUV_SCALE_ARGB_H_ #include "libyuv/basic_types.h" #include "libyuv/scale.h" // For FilterMode #ifdef __cplusplus namespace libyuv { extern "C" { #endif LIBYUV_API int ARGBScale(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, enum FilterMode filtering); // Clipped scale takes destination rectangle coordinates for clip values. LIBYUV_API int ARGBScaleClip(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering); // Scale with YUV conversion to ARGB and clipping. 
LIBYUV_API int YUVToARGBScaleClip(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint32_t src_fourcc, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, uint32_t dst_fourcc, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/scale_row.h000066400000000000000000002337731416500237200226310ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ #define INCLUDE_LIBYUV_SCALE_ROW_H_ #include "libyuv/basic_types.h" #include "libyuv/scale.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if defined(__pnacl__) || defined(__CLR_VER) || \ (defined(__native_client__) && defined(__x86_64__)) || \ (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif #if defined(__native_client__) #define LIBYUV_DISABLE_NEON #endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) #define GCC_HAS_AVX2 1 #endif // GNUC >= 4.7 #endif // __GNUC__ // clang >= 3.4.0 required for AVX2. 
#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
// Version gate: major > 3, or 3.x with minor >= 4.
#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
#define CLANG_HAS_AVX2 1
#endif  // clang >= 3.4
#endif  // __clang__

// Visual C 2012 required for AVX2.
// NOTE(review): only _M_IX86 is tested, so a 64-bit MSVC build (which
// defines _M_X64 instead) would not get VISUALC_HAS_AVX2 - confirm intended.
#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
    _MSC_VER >= 1700
#define VISUALC_HAS_AVX2 1
#endif  // VisualStudio >= 2012

// Each HAS_* macro below is defined (with no value) only when the guard of
// its group holds; defining LIBYUV_DISABLE_X86 switches every x86 group off.
// Presumably these gate use of the matching _SSE2/_SSSE3 row functions -
// the consumers are not visible here.

// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_FIXEDDIV1_X86
#define HAS_FIXEDDIV_X86
#define HAS_SCALEADDROW_SSE2
#define HAS_SCALEARGBCOLS_SSE2
#define HAS_SCALEARGBCOLSUP2_SSE2
#define HAS_SCALEARGBFILTERCOLS_SSSE3
#define HAS_SCALEARGBROWDOWN2_SSE2
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
#define HAS_SCALECOLSUP2_SSE2
#define HAS_SCALEFILTERCOLS_SSSE3
#define HAS_SCALEROWDOWN2_SSSE3
#define HAS_SCALEROWDOWN34_SSSE3
#define HAS_SCALEROWDOWN38_SSSE3
#define HAS_SCALEROWDOWN4_SSSE3
#endif

// The following are available for gcc/clang x86 platforms:
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2BILINEAR_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_12_SSSE3
#define HAS_SCALEROWUP2BILINEAR_12_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_16_SSE2
#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2
#endif

// The following are available for gcc/clang x86 platforms, but
// require clang 3.4 or gcc 4.7.
// TODO(fbarchard): Port to Visual C
#if !defined(LIBYUV_DISABLE_X86) &&               \
    (defined(__x86_64__) || defined(__i386__)) && \
    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
#define HAS_SCALEROWUP2BILINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_12_AVX2
#define HAS_SCALEROWUP2BILINEAR_12_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2BILINEAR_16_AVX2
#define HAS_SCALEUVROWUP2LINEAR_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#define HAS_SCALEUVROWUP2LINEAR_16_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2
#endif

// The following are available on all x86 platforms, but
// require VS2012, clang 3.4 or gcc 4.7.
// The code supports NaCL but requires a new compiler and validator.
#if !defined(LIBYUV_DISABLE_X86) &&                          \
    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
     defined(GCC_HAS_AVX2))
#define HAS_SCALEADDROW_AVX2
#define HAS_SCALEROWDOWN2_AVX2
#define HAS_SCALEROWDOWN4_AVX2
#endif

// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
#define HAS_SCALEADDROW_NEON
#define HAS_SCALEARGBCOLS_NEON
#define HAS_SCALEARGBFILTERCOLS_NEON
#define HAS_SCALEARGBROWDOWN2_NEON
#define HAS_SCALEARGBROWDOWNEVEN_NEON
#define HAS_SCALEFILTERCOLS_NEON
#define HAS_SCALEROWDOWN2_NEON
#define HAS_SCALEROWDOWN34_NEON
#define HAS_SCALEROWDOWN38_NEON
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
#define HAS_SCALEROWUP2BILINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_12_NEON
#define HAS_SCALEROWUP2BILINEAR_12_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2BILINEAR_16_NEON
#define HAS_SCALEUVROWUP2LINEAR_NEON
#define HAS_SCALEUVROWUP2BILINEAR_NEON
#define HAS_SCALEUVROWUP2LINEAR_16_NEON
#define HAS_SCALEUVROWUP2BILINEAR_16_NEON
#endif

// MIPS MSA group: requires __mips_msa and no LIBYUV_DISABLE_MSA.
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#define HAS_SCALEADDROW_MSA
#define HAS_SCALEARGBCOLS_MSA
#define HAS_SCALEARGBFILTERCOLS_MSA
#define HAS_SCALEARGBROWDOWN2_MSA
#define HAS_SCALEARGBROWDOWNEVEN_MSA
#define HAS_SCALEFILTERCOLS_MSA
#define HAS_SCALEROWDOWN2_MSA
#define HAS_SCALEROWDOWN34_MSA
#define HAS_SCALEROWDOWN38_MSA
#define HAS_SCALEROWDOWN4_MSA
#endif

// Loongson MMI group: requires _MIPS_ARCH_LOONGSON3A and no
// LIBYUV_DISABLE_MMI.
#if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)
#define HAS_FIXEDDIV1_MIPS
#define HAS_FIXEDDIV_MIPS
#define HAS_SCALEADDROW_16_MMI
#define HAS_SCALEADDROW_MMI
#define HAS_SCALEARGBCOLS_MMI
#define HAS_SCALEARGBCOLSUP2_MMI
#define HAS_SCALEARGBROWDOWN2_MMI
#define HAS_SCALEARGBROWDOWNEVEN_MMI
#define HAS_SCALECOLS_16_MMI
#define HAS_SCALECOLS_MMI
#define HAS_SCALEROWDOWN2_16_MMI
#define HAS_SCALEROWDOWN2_MMI
#define HAS_SCALEROWDOWN4_16_MMI
#define HAS_SCALEROWDOWN4_MMI
#define HAS_SCALEROWDOWN34_MMI
#endif

// Scale ARGB vertically with bilinear interpolation.
// NOTE(review): bpp is presumably bytes per pixel - confirm against the
// implementation.
void ScalePlaneVertical(int src_height,
                        int dst_width, int dst_height,
                        int src_stride, int dst_stride,
                        const uint8_t* src_argb, uint8_t* dst_argb,
                        int x, int y, int dy,
                        int bpp,
                        enum FilterMode filtering);

// Same parameter layout as ScalePlaneVertical, with uint16_t samples; the
// pixel-size argument is named wpp here instead of bpp.
// NOTE(review): wpp is presumably 16-bit words per pixel - confirm.
void ScalePlaneVertical_16(int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint16_t* src_argb, uint16_t* dst_argb,
                           int x, int y, int dy,
                           int wpp,
                           enum FilterMode filtering);

// Simplify the filtering based on scale factors.
// Takes the requested mode plus both image sizes and returns a FilterMode.
enum FilterMode ScaleFilterReduce(int src_width, int src_height,
                                  int dst_width, int dst_height,
                                  enum FilterMode filtering);

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_C(int num, int div);
int FixedDiv_X86(int num, int div);
int FixedDiv_MIPS(int num, int div);

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_C(int num, int div); int FixedDiv1_X86(int num, int div); int FixedDiv1_MIPS(int num, int div); #ifdef HAS_FIXEDDIV_X86 #define FixedDiv FixedDiv_X86 #define FixedDiv1 FixedDiv1_X86 #elif defined HAS_FIXEDDIV_MIPS #define FixedDiv FixedDiv_MIPS #define FixedDiv1 FixedDiv1_MIPS #else #define FixedDiv FixedDiv_C #define FixedDiv1 FixedDiv1_C #endif // Compute slope values for stepping. void ScaleSlope(int src_width, int src_height, int dst_width, int dst_height, enum FilterMode filtering, int* x, int* y, int* dx, int* dy); void ScaleRowDown2_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown4Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown34_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown34_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width); void 
ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* d, int dst_width); void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width); void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* d, int dst_width); void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleCols_C(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleCols_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx); void ScaleColsUp2_C(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int, int); void ScaleColsUp2_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int, int); void ScaleFilterCols_C(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleFilterCols_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx); void ScaleFilterCols64_C(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x32, int dx); void ScaleFilterCols64_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x32, int dx); void ScaleRowDown38_C(const 
uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown38_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width); void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_16_C(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width); void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEven_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBCols_C(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBCols64_C(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x32, int dx); void ScaleARGBColsUp2_C(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int, int); void ScaleARGBFilterCols_C(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBFilterCols64_C(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x32, int dx); void ScaleUVRowDown2_C(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Linear_C(const 
uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Box_C(const uint8_t* src_uv, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEven_C(const uint8_t* src_uv, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVCols_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx); void ScaleUVCols64_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x32, int dx); void ScaleUVColsUp2_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int, int); void ScaleUVFilterCols_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx); void ScaleUVFilterCols64_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x32, int dx); // Specialized scalers for x86. 
void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* 
src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_12_Any_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_12_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t 
dst_stride, int dst_width); void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_12_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_12_Any_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void 
ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleColsUp2_SSE2(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); // ARGB Column functions void ScaleARGBCols_SSE2(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void 
ScaleARGBCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBCols_MSA(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleARGBCols_MMI(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleARGBCols_Any_MMI(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); // ARGB Row functions void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, ptrdiff_t 
src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Linear_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDown2Box_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, 
uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width); void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEven_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleARGBRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); // UV Row functions void ScaleUVRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Linear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void 
ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleUVRowDown2_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Linear_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Linear_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2Box_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_uv, int dst_width); void ScaleUVRowDown2_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Box_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Linear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void 
ScaleUVRowDown2_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Linear_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Linear_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDown2Box_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEven_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEvenBox_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEvenBox_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEven_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEvenBox_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEven_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEvenBox_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_uv, int dst_width); void ScaleUVRowDownEven_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEvenBox_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEven_Any_NEON(const uint8_t* src_ptr, ptrdiff_t 
src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEven_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEven_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* 
src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. // Note - not static due to reuse in convert for 444 to 420. 
void ScaleRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // 32 -> 12 void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // 32x3 -> 12x1 void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, 
ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // 32 -> 12 void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // 32x3 -> 12x1 void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); // 32x2 -> 12x1 void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_12_Any_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_12_Any_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr, uint16_t* 
dst_ptr, int dst_width); void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width); void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleFilterCols_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleRowDown2_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown38_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleFilterCols_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleRowDown34_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown34_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width); void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width); void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, 
ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void 
ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown2Box_Odd_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width); void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); void ScaleAddRow_16_MMI(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width); void ScaleColsUp2_MMI(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx); void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx); void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx); void ScaleRowDown2_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Linear_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown2Box_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleRowDown4Box_Any_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void ScaleAddRow_Any_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_SCALE_ROW_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/scale_uv.h000066400000000000000000000026201416500237200224350ustar00rootroot00000000000000/* * Copyright 2020 The LibYuv Project Authors. 
All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_SCALE_UV_H_ #define INCLUDE_LIBYUV_SCALE_UV_H_ #include "libyuv/basic_types.h" #include "libyuv/scale.h" // For FilterMode #ifdef __cplusplus namespace libyuv { extern "C" { #endif LIBYUV_API int UVScale(const uint8_t* src_uv, int src_stride_uv, int src_width, int src_height, uint8_t* dst_uv, int dst_stride_uv, int dst_width, int dst_height, enum FilterMode filtering); // Scale a 16 bit UV image. // This function is currently incomplete, it can't handle all cases. LIBYUV_API int UVScale_16(const uint16_t* src_uv, int src_stride_uv, int src_width, int src_height, uint16_t* dst_uv, int dst_stride_uv, int dst_width, int dst_height, enum FilterMode filtering); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_SCALE_UV_H_ libyuv-0.0~git20220104.b91df1a/include/libyuv/version.h000066400000000000000000000010371416500237200223220ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ #define LIBYUV_VERSION 1807 #endif // INCLUDE_LIBYUV_VERSION_H_libyuv-0.0~git20220104.b91df1a/include/libyuv/video_common.h000066400000000000000000000211701416500237200233130ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. 
All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ // Common definitions for video, including fourcc and VideoFormat. #ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ #define INCLUDE_LIBYUV_VIDEO_COMMON_H_ #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif ////////////////////////////////////////////////////////////////////////////// // Definition of FourCC codes ////////////////////////////////////////////////////////////////////////////// // Convert four characters to a FourCC code. // Needs to be a macro otherwise the OS X compiler complains when the kFormat* // constants are used in a switch. #ifdef __cplusplus #define FOURCC(a, b, c, d) \ ((static_cast(a)) | (static_cast(b) << 8) | \ (static_cast(c) << 16) | /* NOLINT */ \ (static_cast(d) << 24)) /* NOLINT */ #else #define FOURCC(a, b, c, d) \ (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ #endif // Some pages discussing FourCC codes: // http://www.fourcc.org/yuv.php // http://v4l2spec.bytesex.org/spec/book1.htm // http://developer.apple.com/quicktime/icefloe/dispatch020.html // http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12 // http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt // FourCC codes grouped according to implementation efficiency. // Primary formats should convert in 1 efficient step. // Secondary formats are converted in 2 steps. // Auxilliary formats call primary converters. enum FourCC { // 10 Primary YUV formats: 5 planar, 2 biplanar, 2 packed. 
FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), FOURCC_I010 = FOURCC('I', '0', '1', '0'), // bt.601 10 bit 420 FOURCC_I210 = FOURCC('I', '2', '1', '0'), // bt.601 10 bit 422 // 1 Secondary YUV format: row biplanar. deprecated. FOURCC_M420 = FOURCC('M', '4', '2', '0'), // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. // 1 Primary Compressed YUV format. FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), // 14 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. 
FOURCC_J420 = FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc FOURCC_J422 = FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc FOURCC_J444 = FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc FOURCC_J400 = FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc FOURCC_F420 = FOURCC('F', '4', '2', '0'), // bt.709 full, unofficial fourcc FOURCC_F422 = FOURCC('F', '4', '2', '2'), // bt.709 full, unofficial fourcc FOURCC_F444 = FOURCC('F', '4', '4', '4'), // bt.709 full, unofficial fourcc FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc FOURCC_U420 = FOURCC('U', '4', '2', '0'), // bt.2020, unofficial fourcc FOURCC_U422 = FOURCC('U', '4', '2', '2'), // bt.2020, unofficial fourcc FOURCC_U444 = FOURCC('U', '4', '4', '4'), // bt.2020, unofficial fourcc FOURCC_F010 = FOURCC('F', '0', '1', '0'), // bt.709 full range 10 bit 420 FOURCC_H010 = FOURCC('H', '0', '1', '0'), // bt.709 10 bit 420 FOURCC_U010 = FOURCC('U', '0', '1', '0'), // bt.2020 10 bit 420 FOURCC_F210 = FOURCC('F', '2', '1', '0'), // bt.709 full range 10 bit 422 FOURCC_H210 = FOURCC('H', '2', '1', '0'), // bt.709 10 bit 422 FOURCC_U210 = FOURCC('U', '2', '1', '0'), // bt.2020 10 bit 422 FOURCC_P010 = FOURCC('P', '0', '1', '0'), FOURCC_P210 = FOURCC('P', '2', '1', '0'), // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc. FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420. FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422. FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444. FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2. FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac. FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY. FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac. 
FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG. FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac. FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR. FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW. FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG. FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO. FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. // deprecated formats. Not supported, but defined for backward compatibility. FOURCC_I411 = FOURCC('I', '4', '1', '1'), FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), FOURCC_H264 = FOURCC('H', '2', '6', '4'), // Match any fourcc. FOURCC_ANY = -1, }; enum FourCCBpp { // Canonical fourcc codes used in our code. 
FOURCC_BPP_I420 = 12, FOURCC_BPP_I422 = 16, FOURCC_BPP_I444 = 24, FOURCC_BPP_I411 = 12, FOURCC_BPP_I400 = 8, FOURCC_BPP_NV21 = 12, FOURCC_BPP_NV12 = 12, FOURCC_BPP_YUY2 = 16, FOURCC_BPP_UYVY = 16, FOURCC_BPP_M420 = 12, // deprecated FOURCC_BPP_Q420 = 12, FOURCC_BPP_ARGB = 32, FOURCC_BPP_BGRA = 32, FOURCC_BPP_ABGR = 32, FOURCC_BPP_RGBA = 32, FOURCC_BPP_AR30 = 32, FOURCC_BPP_AB30 = 32, FOURCC_BPP_AR64 = 64, FOURCC_BPP_AB64 = 64, FOURCC_BPP_24BG = 24, FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, FOURCC_BPP_RGBO = 16, FOURCC_BPP_R444 = 16, FOURCC_BPP_RGGB = 8, FOURCC_BPP_BGGR = 8, FOURCC_BPP_GRBG = 8, FOURCC_BPP_GBRG = 8, FOURCC_BPP_YV12 = 12, FOURCC_BPP_YV16 = 16, FOURCC_BPP_YV24 = 24, FOURCC_BPP_YU12 = 12, FOURCC_BPP_J420 = 12, FOURCC_BPP_J400 = 8, FOURCC_BPP_H420 = 12, FOURCC_BPP_H422 = 16, FOURCC_BPP_I010 = 15, FOURCC_BPP_I210 = 20, FOURCC_BPP_H010 = 15, FOURCC_BPP_H210 = 20, FOURCC_BPP_P010 = 15, FOURCC_BPP_P210 = 20, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, FOURCC_BPP_YU16 = 16, FOURCC_BPP_YU24 = 24, FOURCC_BPP_YUYV = 16, FOURCC_BPP_YUVS = 16, FOURCC_BPP_HDYC = 16, FOURCC_BPP_2VUY = 16, FOURCC_BPP_JPEG = 1, FOURCC_BPP_DMB1 = 1, FOURCC_BPP_BA81 = 8, FOURCC_BPP_RGB3 = 24, FOURCC_BPP_BGR3 = 24, FOURCC_BPP_CM32 = 32, FOURCC_BPP_CM24 = 24, // Match any fourcc. FOURCC_BPP_ANY = 0, // 0 means unknown. }; // Converts fourcc aliases into canonical ones. LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ libyuv-0.0~git20220104.b91df1a/libyuv.gni000066400000000000000000000016141416500237200175410ustar00rootroot00000000000000# Copyright 2016 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. 
All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.

import("//build_overrides/build.gni")
import("//build/config/arm.gni")
import("//build/config/mips.gni")

declare_args() {
  libyuv_include_tests = !build_with_chromium
  libyuv_disable_jpeg = false
  libyuv_use_neon =
      current_cpu == "arm64" ||
      (current_cpu == "arm" && (arm_use_neon || arm_optionally_use_neon))
  libyuv_use_msa =
      (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_msa
  libyuv_use_mmi =
      (current_cpu == "mips64el" || current_cpu == "mipsel") && mips_use_mmi
}
libyuv-0.0~git20220104.b91df1a/linux.mk000066400000000000000000000055021416500237200172200ustar00rootroot00000000000000
# This is a generic makefile for libyuv for gcc.
# make -f linux.mk CXX=clang++
CC?=gcc
CFLAGS?=-O2 -fomit-frame-pointer
CFLAGS+=-Iinclude/

CXX?=g++
CXXFLAGS?=-O2 -fomit-frame-pointer
CXXFLAGS+=-Iinclude/

# Objects archived into libyuv.a.
LOCAL_OBJ_FILES := \
    source/compare.o \
    source/compare_common.o \
    source/compare_gcc.o \
    source/compare_mmi.o \
    source/compare_msa.o \
    source/compare_neon.o \
    source/compare_neon64.o \
    source/compare_win.o \
    source/convert.o \
    source/convert_argb.o \
    source/convert_from.o \
    source/convert_from_argb.o \
    source/convert_jpeg.o \
    source/convert_to_argb.o \
    source/convert_to_i420.o \
    source/cpu_id.o \
    source/mjpeg_decoder.o \
    source/mjpeg_validate.o \
    source/planar_functions.o \
    source/rotate.o \
    source/rotate_any.o \
    source/rotate_argb.o \
    source/rotate_common.o \
    source/rotate_gcc.o \
    source/rotate_mmi.o \
    source/rotate_msa.o \
    source/rotate_neon.o \
    source/rotate_neon64.o \
    source/rotate_win.o \
    source/row_any.o \
    source/row_common.o \
    source/row_gcc.o \
    source/row_mmi.o \
    source/row_msa.o \
    source/row_neon.o \
    source/row_neon64.o \
    source/row_win.o \
    source/scale.o \
    source/scale_any.o \
    source/scale_argb.o \
    source/scale_common.o \
    source/scale_gcc.o \
    source/scale_mmi.o \
    source/scale_msa.o \
    source/scale_neon.o \
    source/scale_neon64.o \
    source/scale_uv.o \
    source/scale_win.o \
    source/video_common.o

# Remove a target whose recipe failed so a partial file never looks current.
.DELETE_ON_ERROR:

.cc.o:
	$(CXX) -c $(CXXFLAGS) $*.cc -o $*.o

.c.o:
	$(CC) -c $(CFLAGS) $*.c -o $*.o

# 'all' and 'clean' are commands, not files; keep them working even if a file
# with one of those names exists in the tree.
.PHONY: all clean

all: libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr

libyuv.a: $(LOCAL_OBJ_FILES)
	$(AR) $(ARFLAGS) $@ $(LOCAL_OBJ_FILES)

# A C++ test utility that uses libyuv conversion.
yuvconvert: util/yuvconvert.cc libyuv.a
	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconvert.cc libyuv.a

# A C test utility that generates yuvconstants for yuv to rgb.
# -lm goes after the inputs: libraries are searched in command-line order, so
# it has to follow the objects that reference it.
yuvconstants: util/yuvconstants.c libyuv.a
	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ util/yuvconstants.c libyuv.a -lm

# A standalone test utility
# Every source that is compiled is a prerequisite, so editing psnr_main.cc or
# ssim.cc triggers a relink.
psnr: util/psnr.cc util/psnr_main.cc util/ssim.cc
	$(CXX) $(CXXFLAGS) -Iutil/ -o $@ $^

# A simple conversion example.
i444tonv12_eg: util/i444tonv12_eg.cc libyuv.a
	$(CXX) $(CXXFLAGS) -o $@ util/i444tonv12_eg.cc libyuv.a

# A C test utility that uses libyuv conversion from C.
# gcc 4.4 and older require -fno-exceptions to avoid link error on __gxx_personality_v0
# CC=gcc-4.4 CXXFLAGS=-fno-exceptions CXX=g++-4.4 make -f linux.mk
cpuid: util/cpuid.c libyuv.a
	$(CC) $(CFLAGS) -o $@ util/cpuid.c libyuv.a

clean:
	$(RM) source/*.o *.ii *.s libyuv.a i444tonv12_eg yuvconvert yuvconstants cpuid psnr
libyuv-0.0~git20220104.b91df1a/public.mk000066400000000000000000000004731416500237200173410ustar00rootroot00000000000000
# This file contains all the common make variables which are useful for
# anyone depending on this library.
# Note that dependencies on NDK are not directly listed since NDK auto adds
# them.

LIBYUV_INCLUDES := $(LIBYUV_PATH)/include

LIBYUV_C_FLAGS :=

LIBYUV_CPP_FLAGS :=

LIBYUV_LDLIBS :=
LIBYUV_DEP_MODULES :=
libyuv-0.0~git20220104.b91df1a/pylintrc000066400000000000000000000013001416500237200173070ustar00rootroot00000000000000
[MESSAGES CONTROL]

# Disable the message, report, category or checker with the given id(s).
# TODO(kjellander): Reduce this list to as small as possible.
disable=I0010,I0011,bad-continuation,broad-except,duplicate-code,eval-used,exec-used,fixme,invalid-name,missing-docstring,no-init,no-member,too-few-public-methods,too-many-ancestors,too-many-arguments,too-many-branches,too-many-function-args,too-many-instance-attributes,too-many-lines,too-many-locals,too-many-public-methods,too-many-return-statements,too-many-statements [REPORTS] # Don't write out full reports, just messages. reports=no [FORMAT] # We use two spaces for indents, instead of the usual four spaces or tab. indent-string=' ' libyuv-0.0~git20220104.b91df1a/source/000077500000000000000000000000001416500237200170265ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/source/compare.cc000066400000000000000000000312611416500237200207660ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/compare.h" #include #include #ifdef _OPENMP #include #endif #include "libyuv/basic_types.h" #include "libyuv/compare_row.h" #include "libyuv/cpu_id.h" #include "libyuv/row.h" #include "libyuv/video_common.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // hash seed of 5381 recommended. 
// Computes a djb2 hash of |count| bytes at |src|, continuing from |seed|.
// The buffer is fed to the selected row kernel in chunks of at most
// kBlockSize bytes (the kernels take an int count); the final count & 15
// bytes always go through HashDjb2_C.
LIBYUV_API
uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
  const int kBlockSize = 1 << 15;  // 32768;
  int remainder;
  // Kernel used for everything except the last <16 bytes; C by default.
  uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
      HashDjb2_C;
#if defined(HAS_HASHDJB2_SSE41)
  if (TestCpuFlag(kCpuHasSSE41)) {
    HashDjb2_SSE = HashDjb2_SSE41;
  }
#endif
#if defined(HAS_HASHDJB2_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    HashDjb2_SSE = HashDjb2_AVX2;
  }
#endif

  // Whole kBlockSize chunks.
  while (count >= (uint64_t)(kBlockSize)) {
    seed = HashDjb2_SSE(src, kBlockSize, seed);
    src += kBlockSize;
    count -= kBlockSize;
  }
  // Largest multiple of 16 left (count < kBlockSize here, so it fits an int).
  remainder = (int)count & ~15;
  if (remainder) {
    seed = HashDjb2_SSE(src, remainder, seed);
    src += remainder;
    count -= remainder;
  }
  // Trailing 1..15 bytes.
  remainder = (int)count & 15;
  if (remainder) {
    seed = HashDjb2_C(src, remainder, seed);
  }
  return seed;
}

// Checks bytes 0 and 3 of each 4 byte pixel in one row, two pixels per loop.
// Returns FOURCC_BGRA at the first pixel whose byte 0 is not 255,
// FOURCC_ARGB at the first pixel whose byte 3 is not 255, or 0 if every
// checked byte is 255.
static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
      return FOURCC_BGRA;
    }
    if (argb[3] != 255) {  // Fourth byte is not Alpha of 255, so not BGRA.
      return FOURCC_ARGB;
    }
    if (argb[4] != 255) {  // Second pixel first byte is not Alpha of 255.
      return FOURCC_BGRA;
    }
    if (argb[7] != 255) {  // Second pixel fourth byte is not Alpha of 255.
      return FOURCC_ARGB;
    }
    argb += 8;
  }
  // Odd width: one pixel left over.
  if (width & 1) {
    if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
      return FOURCC_BGRA;
    }
    if (argb[3] != 255) {  // 4th byte is not Alpha of 255, so not BGRA.
      return FOURCC_ARGB;
    }
  }
  return 0;
}

// Scan an opaque argb image and return fourcc based on alpha offset.
// Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
// Rows are scanned top to bottom and the scan stops at the first row that
// yields a non-zero fourcc.
LIBYUV_API
uint32_t ARGBDetect(const uint8_t* argb,
                    int stride_argb,
                    int width,
                    int height) {
  uint32_t fourcc = 0;
  int h;

  // Coalesce rows.
  // With no padding between rows the image is scanned as one long row.
  if (stride_argb == width * 4) {
    width *= height;
    height = 1;
    stride_argb = 0;
  }
  for (h = 0; h < height && fourcc == 0; ++h) {
    fourcc = ARGBDetectRow_C(argb, width);
    argb += stride_argb;
  }
  return fourcc;
}

// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes.
// So actual maximum is 1 less loop, which is 65536 - 32 bytes.

// Sums the per-call results of the selected HammingDistance kernel over
// |count| bytes of |src_a| and |src_b|. Work is split into kBlockSize
// chunks, then the remaining multiple of kSimdSize, then a C tail of fewer
// than kSimdSize bytes.
LIBYUV_API
uint64_t ComputeHammingDistance(const uint8_t* src_a,
                                const uint8_t* src_b,
                                int count) {
  const int kBlockSize = 1 << 15;  // 32768;
  const int kSimdSize = 64;
  // SIMD for multiple of 64, and C for remainder
  int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
  uint64_t diff = 0;
  int i;
  // Each later #if block overrides the earlier choice when its CPU flag is
  // set.
  uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
                              int count) = HammingDistance_C;
#if defined(HAS_HAMMINGDISTANCE_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    HammingDistance = HammingDistance_NEON;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    HammingDistance = HammingDistance_SSSE3;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_SSE42)
  if (TestCpuFlag(kCpuHasSSE42)) {
    HammingDistance = HammingDistance_SSE42;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    HammingDistance = HammingDistance_AVX2;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    HammingDistance = HammingDistance_MMI;
  }
#endif
#if defined(HAS_HAMMINGDISTANCE_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    HammingDistance = HammingDistance_MSA;
  }
#endif

#ifdef _OPENMP
#pragma omp parallel for reduction(+ : diff)
#endif
  // Full blocks. The loop indexes from the original pointers (src_a + i) so
  // the iterations are independent of each other.
  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
  }
  // Skip past the bytes covered by the full blocks.
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  if (remainder) {
    diff += HammingDistance(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  remainder = count & (kSimdSize - 1);
  if (remainder) {
    diff += HammingDistance_C(src_a, src_b, remainder);
  }
  return diff;
}

// TODO(fbarchard): Refactor into row function.
// Sums the per-call results of the selected SumSquareError kernel over
// |count| bytes of |src_a| and |src_b|, accumulating into 64 bits.
LIBYUV_API
uint64_t ComputeSumSquareError(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  // SumSquareError returns values 0 to 65535 for each squared difference.
  // Up to 65536 of those can be summed and remain within a uint32_t.
  // After each block of 65536 pixels, accumulate into a uint64_t.
  const int kBlockSize = 65536;
  // Multiple-of-32 part of what is left after the full blocks.
  int remainder = count & (kBlockSize - 1) & ~31;
  uint64_t sse = 0;
  int i;
  // Each later #if block overrides the earlier choice when its CPU flag is
  // set.
  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
                             int count) = SumSquareError_C;
#if defined(HAS_SUMSQUAREERROR_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SumSquareError = SumSquareError_NEON;
  }
#endif
#if defined(HAS_SUMSQUAREERROR_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    // Note only used for multiples of 16 so count is not checked.
    SumSquareError = SumSquareError_SSE2;
  }
#endif
#if defined(HAS_SUMSQUAREERROR_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    // Note only used for multiples of 32 so count is not checked.
    SumSquareError = SumSquareError_AVX2;
  }
#endif
#if defined(HAS_SUMSQUAREERROR_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    SumSquareError = SumSquareError_MMI;
  }
#endif
#if defined(HAS_SUMSQUAREERROR_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    SumSquareError = SumSquareError_MSA;
  }
#endif
#ifdef _OPENMP
#pragma omp parallel for reduction(+ : sse)
#endif
  // Full blocks, indexed from the original pointers.
  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
    sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
  }
  // Skip past the bytes covered by the full blocks.
  src_a += count & ~(kBlockSize - 1);
  src_b += count & ~(kBlockSize - 1);
  if (remainder) {
    sse += SumSquareError(src_a, src_b, remainder);
    src_a += remainder;
    src_b += remainder;
  }
  // Trailing 1..31 bytes always use the C kernel.
  remainder = count & 31;
  if (remainder) {
    sse += SumSquareError_C(src_a, src_b, remainder);
  }
  return sse;
}

// Sums ComputeSumSquareError over |height| rows of |width| bytes, advancing
// each plane by its own stride.
LIBYUV_API
uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
                                    int stride_a,
                                    const uint8_t* src_b,
                                    int stride_b,
                                    int width,
                                    int height) {
  uint64_t sse = 0;
  int h;
  // Coalesce rows.
  // When neither plane has row padding, treat the plane as a single row.
  if (stride_a == width && stride_b == width) {
    width *= height;
    height = 1;
    stride_a = stride_b = 0;
  }
  for (h = 0; h < height; ++h) {
    sse += ComputeSumSquareError(src_a, src_b, width);
    src_a += stride_a;
    src_b += stride_b;
  }
  return sse;
}

// Converts a sum of squared errors over |count| samples to PSNR for a peak
// value of 255. The result never exceeds kMaxPsnr, which is also returned
// when |sse| is 0.
LIBYUV_API
double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
  double psnr;
  if (sse > 0) {
    // Note: despite its name, |mse| holds count / sse, i.e. the reciprocal of
    // the mean squared error, so the log10 argument is 255^2 / MSE.
    double mse = (double)count / (double)sse;
    psnr = 10.0 * log10(255.0 * 255.0 * mse);
  } else {
    psnr = kMaxPsnr;  // Limit to prevent divide by 0
  }

  if (psnr > kMaxPsnr) {
    psnr = kMaxPsnr;
  }

  return psnr;
}

// PSNR of one plane: sum of squared errors over width * height samples.
LIBYUV_API
double CalcFramePsnr(const uint8_t* src_a,
                     int stride_a,
                     const uint8_t* src_b,
                     int stride_b,
                     int width,
                     int height) {
  const uint64_t samples = (uint64_t)width * (uint64_t)height;
  const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
                                                  stride_b, width, height);
  return SumSquareErrorToPsnr(sse, samples);
}

// PSNR over the Y, U and V planes together: the three plane SSEs are added
// and divided by the total sample count. U and V planes are taken to be
// (width + 1) / 2 by (height + 1) / 2.
LIBYUV_API
double I420Psnr(const uint8_t* src_y_a,
                int stride_y_a,
                const uint8_t* src_u_a,
                int stride_u_a,
                const uint8_t* src_v_a,
                int stride_v_a,
                const uint8_t* src_y_b,
                int stride_y_b,
                const uint8_t* src_u_b,
                int stride_u_b,
                const uint8_t* src_v_b,
                int stride_v_b,
                int width,
                int height) {
  const uint64_t sse_y = ComputeSumSquareErrorPlane(
      src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
  const int width_uv = (width + 1) >> 1;
  const int height_uv = (height + 1) >> 1;
  const uint64_t sse_u = ComputeSumSquareErrorPlane(
      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
  const uint64_t sse_v = ComputeSumSquareErrorPlane(
      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
  const uint64_t samples = (uint64_t)width * (uint64_t)height +
                           2 * ((uint64_t)width_uv * (uint64_t)height_uv);
  const uint64_t sse = sse_y + sse_u + sse_v;
  return SumSquareErrorToPsnr(sse, samples);
}

// SSIM stabilising constants, pre-scaled by 64^2; Ssim8x8_C rescales them by
// the window's pixel count.
static const int64_t cc1 = 26634;   // (64^2*(.01*255)^2
static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2

// Computes the SSIM ratio for one 8x8 window of |src_a| and |src_b| using
// integer sums. Returns DBL_MAX if the denominator is 0.
static double Ssim8x8_C(const uint8_t* src_a,
                        int stride_a,
                        const uint8_t* src_b,
                        int stride_b) {
  int64_t sum_a = 0;
  int64_t sum_b = 0;
  int64_t sum_sq_a = 0;
  int64_t sum_sq_b = 0;
  int64_t sum_axb = 0;

  int i;
  // Accumulate sums, sums of squares and the cross product over 8 rows.
  for (i = 0; i < 8; ++i) {
    int j;
    for (j = 0; j < 8; ++j) {
      sum_a += src_a[j];
      sum_b += src_b[j];
      sum_sq_a += src_a[j] * src_a[j];
      sum_sq_b += src_b[j] * src_b[j];
      sum_axb += src_a[j] * src_b[j];
    }

    src_a += stride_a;
    src_b += stride_b;
  }

  {
    const int64_t count = 64;
    // scale the constants by number of pixels
    const int64_t c1 = (cc1 * count * count) >> 12;
    const int64_t c2 = (cc2 * count * count) >> 12;

    const int64_t sum_a_x_sum_b = sum_a * sum_b;

    // Numerator of the SSIM ratio.
    const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) *
                           (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);

    const int64_t sum_a_sq = sum_a * sum_a;
    const int64_t sum_b_sq = sum_b * sum_b;

    // Denominator of the SSIM ratio.
    const int64_t ssim_d =
        (sum_a_sq + sum_b_sq + c1) *
        (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);

    if (ssim_d == 0.0) {
      return DBL_MAX;
    }
    // The multiply by 1.0 forces floating point division.
    return ssim_n * 1.0 / ssim_d;
  }
}

// We are using a 8x8 moving window with starting location of each 8x8 window
// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
// block boundaries to penalize blocking artifacts.
LIBYUV_API double CalcFrameSsim(const uint8_t* src_a, int stride_a, const uint8_t* src_b, int stride_b, int width, int height) { int samples = 0; double ssim_total = 0; double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, int stride_b) = Ssim8x8_C; // sample point start with each 4x4 location int i; for (i = 0; i < height - 8; i += 4) { int j; for (j = 0; j < width - 8; j += 4) { ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b); samples++; } src_a += stride_a * 4; src_b += stride_b * 4; } ssim_total /= samples; return ssim_total; } LIBYUV_API double I420Ssim(const uint8_t* src_y_a, int stride_y_a, const uint8_t* src_u_a, int stride_u_a, const uint8_t* src_v_a, int stride_v_a, const uint8_t* src_y_b, int stride_y_b, const uint8_t* src_u_b, int stride_u_b, const uint8_t* src_v_b, int stride_v_b, int width, int height) { const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/compare_common.cc000066400000000000000000000036201416500237200223340ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/basic_types.h" #include "libyuv/compare_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Hakmem method for hamming distance. uint32_t HammingDistance_C(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff = 0u; int i; for (i = 0; i < count - 3; i += 4) { uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); uint32_t u = x - ((x >> 1) & 0x55555555); u = ((u >> 2) & 0x33333333) + (u & 0x33333333); diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); src_a += 4; src_b += 4; } for (; i < count; ++i) { uint32_t x = *src_a ^ *src_b; uint32_t u = x - ((x >> 1) & 0x55); u = ((u >> 2) & 0x33) + (u & 0x33); diff += (u + (u >> 4)) & 0x0f; src_a += 1; src_b += 1; } return diff; } uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse = 0u; int i; for (i = 0; i < count; ++i) { int diff = src_a[i] - src_b[i]; sse += (uint32_t)(diff * diff); } return sse; } // hash seed of 5381 recommended. // Internal C version of HashDjb2 with int sized count for efficiency. uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { uint32_t hash = seed; int i; for (i = 0; i < count; ++i) { hash += (hash << 5) + src[i]; } return hash; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/compare_gcc.cc000066400000000000000000000336301416500237200216040ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/basic_types.h" #include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #if defined(__x86_64__) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count) { uint64_t diff = 0u; asm volatile( "xor %3,%3 \n" "xor %%r8,%%r8 \n" "xor %%r9,%%r9 \n" "xor %%r10,%%r10 \n" // Process 32 bytes per loop. LABELALIGN "1: \n" "mov (%0),%%rcx \n" "mov 0x8(%0),%%rdx \n" "xor (%1),%%rcx \n" "xor 0x8(%1),%%rdx \n" "popcnt %%rcx,%%rcx \n" "popcnt %%rdx,%%rdx \n" "mov 0x10(%0),%%rsi \n" "mov 0x18(%0),%%rdi \n" "xor 0x10(%1),%%rsi \n" "xor 0x18(%1),%%rdi \n" "popcnt %%rsi,%%rsi \n" "popcnt %%rdi,%%rdi \n" "add $0x20,%0 \n" "add $0x20,%1 \n" "add %%rcx,%3 \n" "add %%rdx,%%r8 \n" "add %%rsi,%%r9 \n" "add %%rdi,%%r10 \n" "sub $0x20,%2 \n" "jg 1b \n" "add %%r8, %3 \n" "add %%r9, %3 \n" "add %%r10, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 "=r"(diff) // %3 : : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); return static_cast(diff); } #else uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff = 0u; asm volatile( // Process 16 bytes per loop. 
LABELALIGN "1: \n" "mov (%0),%%ecx \n" "mov 0x4(%0),%%edx \n" "xor (%1),%%ecx \n" "xor 0x4(%1),%%edx \n" "popcnt %%ecx,%%ecx \n" "add %%ecx,%3 \n" "popcnt %%edx,%%edx \n" "add %%edx,%3 \n" "mov 0x8(%0),%%ecx \n" "mov 0xc(%0),%%edx \n" "xor 0x8(%1),%%ecx \n" "xor 0xc(%1),%%edx \n" "popcnt %%ecx,%%ecx \n" "add %%ecx,%3 \n" "popcnt %%edx,%%edx \n" "add %%edx,%3 \n" "add $0x10,%0 \n" "add $0x10,%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 "+r"(diff) // %3 : : "memory", "cc", "ecx", "edx"); return diff; } #endif static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15}; static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; uint32_t HammingDistance_SSSE3(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff = 0u; asm volatile( "movdqa %4,%%xmm2 \n" "movdqa %5,%%xmm3 \n" "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" "movdqa (%0),%%xmm4 \n" "movdqa 0x10(%0), %%xmm5 \n" "pxor (%0,%1), %%xmm4 \n" "movdqa %%xmm4,%%xmm6 \n" "pand %%xmm2,%%xmm6 \n" "psrlw $0x4,%%xmm4 \n" "movdqa %%xmm3,%%xmm7 \n" "pshufb %%xmm6,%%xmm7 \n" "pand %%xmm2,%%xmm4 \n" "movdqa %%xmm3,%%xmm6 \n" "pshufb %%xmm4,%%xmm6 \n" "paddb %%xmm7,%%xmm6 \n" "pxor 0x10(%0,%1),%%xmm5 \n" "add $0x20,%0 \n" "movdqa %%xmm5,%%xmm4 \n" "pand %%xmm2,%%xmm5 \n" "psrlw $0x4,%%xmm4 \n" "movdqa %%xmm3,%%xmm7 \n" "pshufb %%xmm5,%%xmm7 \n" "pand %%xmm2,%%xmm4 \n" "movdqa %%xmm3,%%xmm5 \n" "pshufb %%xmm4,%%xmm5 \n" "paddb %%xmm7,%%xmm5 \n" "paddb %%xmm5,%%xmm6 \n" "psadbw %%xmm1,%%xmm6 \n" "paddd %%xmm6,%%xmm0 \n" "sub $0x20,%2 \n" "jg 1b \n" "pshufd $0xaa,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "movd %%xmm0, %3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 "=r"(diff) // %3 : "m"(kNibbleMask), // %4 "m"(kBitCount) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); return diff; } #ifdef 
HAS_HAMMINGDISTANCE_AVX2 uint32_t HammingDistance_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff = 0u; asm volatile( "vbroadcastf128 %4,%%ymm2 \n" "vbroadcastf128 %5,%%ymm3 \n" "vpxor %%ymm0,%%ymm0,%%ymm0 \n" "vpxor %%ymm1,%%ymm1,%%ymm1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" "vmovdqa (%0),%%ymm4 \n" "vmovdqa 0x20(%0), %%ymm5 \n" "vpxor (%0,%1), %%ymm4, %%ymm4 \n" "vpand %%ymm2,%%ymm4,%%ymm6 \n" "vpsrlw $0x4,%%ymm4,%%ymm4 \n" "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" "vpand %%ymm2,%%ymm4,%%ymm4 \n" "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" "add $0x40,%0 \n" "vpand %%ymm2,%%ymm4,%%ymm5 \n" "vpsrlw $0x4,%%ymm4,%%ymm4 \n" "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" "vpand %%ymm2,%%ymm4,%%ymm4 \n" "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" "sub $0x40,%2 \n" "jg 1b \n" "vpermq $0xb1,%%ymm0,%%ymm1 \n" "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xaa,%%ymm0,%%ymm1 \n" "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" "vmovd %%xmm0, %3 \n" "vzeroupper \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 "=r"(diff) // %3 : "m"(kNibbleMask), // %4 "m"(kBitCount) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); return diff; } #endif // HAS_HAMMINGDISTANCE_AVX2 uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; asm volatile( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm1 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm2 \n" "lea 0x10(%1),%1 \n" "movdqa %%xmm1,%%xmm3 \n" "psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm3,%%xmm2 \n" "por %%xmm2,%%xmm1 \n" "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm1 \n" "punpckhbw %%xmm5,%%xmm2 \n" "pmaddwd %%xmm1,%%xmm1 \n" "pmaddwd %%xmm2,%%xmm2 \n" "paddd %%xmm1,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n" "sub $0x10,%2 \n" "jg 1b \n" "pshufd 
$0xee,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "pshufd $0x1,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "movd %%xmm0,%3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 "=g"(sse) // %3 ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 static const uvec32 kHashMul0 = { 0x0c3525e1, // 33 ^ 15 0xa3476dc1, // 33 ^ 14 0x3b4039a1, // 33 ^ 13 0x4f5f0981, // 33 ^ 12 }; static const uvec32 kHashMul1 = { 0x30f35d61, // 33 ^ 11 0x855cb541, // 33 ^ 10 0x040a9121, // 33 ^ 9 0x747c7101, // 33 ^ 8 }; static const uvec32 kHashMul2 = { 0xec41d4e1, // 33 ^ 7 0x4cfa3cc1, // 33 ^ 6 0x025528a1, // 33 ^ 5 0x00121881, // 33 ^ 4 }; static const uvec32 kHashMul3 = { 0x00008c61, // 33 ^ 3 0x00000441, // 33 ^ 2 0x00000021, // 33 ^ 1 0x00000001, // 33 ^ 0 }; uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { uint32_t hash; asm volatile( "movd %2,%%xmm0 \n" "pxor %%xmm7,%%xmm7 \n" "movdqa %4,%%xmm6 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm1 \n" "lea 0x10(%0),%0 \n" "pmulld %%xmm6,%%xmm0 \n" "movdqa %5,%%xmm5 \n" "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm7,%%xmm2 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklwd %%xmm7,%%xmm3 \n" "pmulld %%xmm5,%%xmm3 \n" "movdqa %6,%%xmm5 \n" "movdqa %%xmm2,%%xmm4 \n" "punpckhwd %%xmm7,%%xmm4 \n" "pmulld %%xmm5,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "punpckhbw %%xmm7,%%xmm1 \n" "movdqa %%xmm1,%%xmm2 \n" "punpcklwd %%xmm7,%%xmm2 \n" "pmulld %%xmm5,%%xmm2 \n" "movdqa %8,%%xmm5 \n" "punpckhwd %%xmm7,%%xmm1 \n" "pmulld %%xmm5,%%xmm1 \n" "paddd %%xmm4,%%xmm3 \n" "paddd %%xmm2,%%xmm1 \n" "paddd %%xmm3,%%xmm1 \n" "pshufd $0xe,%%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm1 \n" "pshufd $0x1,%%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "sub $0x10,%1 \n" "jg 1b \n" "movd %%xmm0,%3 \n" : "+r"(src), // %0 "+r"(count), // %1 "+rm"(seed), // %2 "=g"(hash) // %3 : "m"(kHash16x33), // %4 "m"(kHashMul0), // %5 "m"(kHashMul1), // %6 "m"(kHashMul2), // %7 
"m"(kHashMul3) // %8 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/compare_mmi.cc000066400000000000000000000117571416500237200216400ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/basic_types.h" #include "libyuv/compare_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) // Hakmem method for hamming distance. 
// Processes 8 bytes per loop iteration: the 64 bit XOR of the inputs is bit
// counted per 32 bit half with the shift/mask sequence annotated below, and
// each half's count is added to |diff|. The loop test follows the body
// (bgtz after the decrement), so one iteration always executes.
// NOTE(review): the asm writes $t0 but the clobber list names only "memory";
// confirm whether $t0 should be declared as clobbered.
uint32_t HammingDistance_MMI(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t diff = 0u;

  uint64_t temp = 0, temp1 = 0, ta = 0, tb = 0;
  // Bit count masks (c1..c3), byte sum multiplier (c4), shift amounts (s1..s3).
  uint64_t c1 = 0x5555555555555555;
  uint64_t c2 = 0x3333333333333333;
  uint64_t c3 = 0x0f0f0f0f0f0f0f0f;
  uint32_t c4 = 0x01010101;
  uint64_t s1 = 1, s2 = 2, s3 = 4;
  __asm__ volatile(
      "1: \n\t"
      "ldc1 %[ta], 0(%[src_a]) \n\t"
      "ldc1 %[tb], 0(%[src_b]) \n\t"
      "xor %[temp], %[ta], %[tb] \n\t"
      "psrlw %[temp1], %[temp], %[s1] \n\t"     // temp1=x>>1
      "and %[temp1], %[temp1], %[c1] \n\t"      // temp1&=c1
      "psubw %[temp1], %[temp], %[temp1] \n\t"  // x-temp1
      "and %[temp], %[temp1], %[c2] \n\t"       // t = (u&c2)
      "psrlw %[temp1], %[temp1], %[s2] \n\t"    // u>>2
      "and %[temp1], %[temp1], %[c2] \n\t"      // u>>2 & c2
      "paddw %[temp1], %[temp1], %[temp] \n\t"  // t1 = t1+t
      "psrlw %[temp], %[temp1], %[s3] \n\t"     // u>>4
      "paddw %[temp1], %[temp1], %[temp] \n\t"  // u+(u>>4)
      "and %[temp1], %[temp1], %[c3] \n\t"      //&c3
      "dmfc1 $t0, %[temp1] \n\t"
      "dsrl32 $t0, $t0, 0 \n\t "
      "mul $t0, $t0, %[c4] \n\t"
      "dsrl $t0, $t0, 24 \n\t"
      "dadd %[diff], %[diff], $t0 \n\t"
      "dmfc1 $t0, %[temp1] \n\t"
      "mul $t0, $t0, %[c4] \n\t"
      "dsrl $t0, $t0, 24 \n\t"
      "dadd %[diff], %[diff], $t0 \n\t"
      "daddiu %[src_a], %[src_a], 8 \n\t"
      "daddiu %[src_b], %[src_b], 8 \n\t"
      "addiu %[count], %[count], -8 \n\t"
      "bgtz %[count], 1b \n\t"
      "nop \n\t"
      : [diff] "+r"(diff), [src_a] "+r"(src_a), [src_b] "+r"(src_b),
        [count] "+r"(count), [ta] "+f"(ta), [tb] "+f"(tb), [temp] "+f"(temp),
        [temp1] "+f"(temp1)
      : [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "r"(c4), [s1] "f"(s1),
        [s2] "f"(s2), [s3] "f"(s3)
      : "memory");
  return diff;
}

// Sum of squared byte differences, 8 bytes per loop iteration. The loop
// exits only when the counter is exactly 0 (bnez), so it terminates as
// written only for a positive multiple of 8.
// NOTE(review): src_a, src_b and count are declared as input-only operands
// but are modified by the daddiu instructions; confirm the constraints.
uint32_t SumSquareError_MMI(const uint8_t* src_a,
                            const uint8_t* src_b,
                            int count) {
  uint32_t sse = 0u;
  uint32_t sse_hi = 0u, sse_lo = 0u;

  uint64_t src1, src2;
  uint64_t diff, diff_hi, diff_lo;
  uint64_t sse_sum, sse_tmp;

  // Zero operand for the byte unpack instructions.
  const uint64_t mask = 0x0ULL;

  __asm__ volatile(
      "xor %[sse_sum], %[sse_sum], %[sse_sum] \n\t"

      "1: \n\t"
      "ldc1 %[src1], 0x00(%[src_a]) \n\t"
      "ldc1 %[src2], 0x00(%[src_b]) \n\t"
      "pasubub %[diff], %[src1], %[src2] \n\t"
      "punpcklbh %[diff_lo], %[diff], %[mask] \n\t"
      "punpckhbh %[diff_hi], %[diff], %[mask] \n\t"
      "pmaddhw %[sse_tmp], %[diff_lo], %[diff_lo] \n\t"
      "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"
      "pmaddhw %[sse_tmp], %[diff_hi], %[diff_hi] \n\t"
      "paddw %[sse_sum], %[sse_sum], %[sse_tmp] \n\t"

      "daddiu %[src_a], %[src_a], 0x08 \n\t"
      "daddiu %[src_b], %[src_b], 0x08 \n\t"
      "daddiu %[count], %[count], -0x08 \n\t"
      "bnez %[count], 1b \n\t"

      // Add the two 32 bit halves of the accumulator.
      "mfc1 %[sse_lo], %[sse_sum] \n\t"
      "mfhc1 %[sse_hi], %[sse_sum] \n\t"
      "daddu %[sse], %[sse_hi], %[sse_lo] \n\t"
      : [sse] "+&r"(sse), [diff] "=&f"(diff), [src1] "=&f"(src1),
        [src2] "=&f"(src2), [diff_lo] "=&f"(diff_lo), [diff_hi] "=&f"(diff_hi),
        [sse_sum] "=&f"(sse_sum), [sse_tmp] "=&f"(sse_tmp),
        [sse_hi] "+&r"(sse_hi), [sse_lo] "+&r"(sse_lo)
      : [src_a] "r"(src_a), [src_b] "r"(src_b), [count] "r"(count),
        [mask] "f"(mask)
      : "memory");

  return sse;
}

#endif  // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/source/compare_msa.cc000066400000000000000000000056341416500237200216330ustar00rootroot00000000000000
/*
 * Copyright 2017 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
*/

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Hamming distance, 32 bytes per loop iteration: XOR the inputs and add the
// per-64-bit-lane population counts into two accumulators. The loop steps
// by 32 while i < count, so a count that is not a multiple of 32 still
// processes a whole final 32 byte chunk.
uint32_t HammingDistance_MSA(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t diff = 0u;
  int i;
  v16u8 src0, src1, src2, src3;
  v2i64 vec0 = {0}, vec1 = {0};

  for (i = 0; i < count; i += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
    src0 ^= src2;
    src1 ^= src3;
    vec0 += __msa_pcnt_d((v2i64)src0);
    vec1 += __msa_pcnt_d((v2i64)src1);
    src_a += 32;
    src_b += 32;
  }

  vec0 += vec1;
  // Add 32 bit elements 0 and 2 of the combined accumulator.
  diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
  diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
  return diff;
}

// Sum of squared byte differences, 32 bytes per loop iteration. Bytes of the
// two inputs are interleaved, subtracted pairwise into 16 bit lanes, then
// squared and accumulated with dot-product adds into four 32 bit
// accumulators. The loop steps by 32 while i < count.
uint32_t SumSquareError_MSA(const uint8_t* src_a,
                            const uint8_t* src_b,
                            int count) {
  uint32_t sse = 0u;
  int i;
  v16u8 src0, src1, src2, src3;
  v8i16 vec0, vec1, vec2, vec3;
  v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
  v2i64 tmp0;

  for (i = 0; i < count; i += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
    vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
    reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
    reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
    reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
    reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
    src_a += 32;
    src_b += 32;
  }

  // Reduce the four accumulators to one value.
  reg0 += reg1;
  reg2 += reg3;
  reg0 += reg2;
  tmp0 = __msa_hadd_s_d(reg0, reg0);
  sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
  sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
  return sse;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
libyuv-0.0~git20220104.b91df1a/source/compare_neon.cc000066400000000000000000000067021416500237200220070ustar00rootroot00000000000000
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// 256 bits at a time
// uses short accumulator which restricts count to 131 KB
// The loop test follows the body (subs ... bgt 1b), so one 32 byte iteration
// always executes.
uint32_t HammingDistance_NEON(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
  uint32_t diff;

  asm volatile(
      "vmov.u16 q4, #0 \n"  // accumulator

      "1: \n"
      "vld1.8 {q0, q1}, [%0]! \n"
      "vld1.8 {q2, q3}, [%1]! \n"
      "veor.32 q0, q0, q2 \n"
      "veor.32 q1, q1, q3 \n"
      "vcnt.i8 q0, q0 \n"
      "vcnt.i8 q1, q1 \n"
      "subs %2, %2, #32 \n"
      "vadd.u8 q0, q0, q1 \n"  // 16 byte counts
      "vpadal.u8 q4, q0 \n"    // 8 shorts
      "bgt 1b \n"

      "vpaddl.u16 q0, q4 \n"  // 4 ints
      "vpadd.u32 d0, d0, d1 \n"
      "vpadd.u32 d0, d0, d0 \n"
      "vmov.32 %3, d0[0] \n"

      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
      :
      : "cc", "q0", "q1", "q2", "q3", "q4");
  return diff;
}

// Sum of squared byte differences, 16 bytes per loop iteration. Differences
// are widened to 16 bits, squared and accumulated into four 32 bit vector
// accumulators (q8..q11) that are added together after the loop. The loop
// test follows the body, so one iteration always executes.
uint32_t SumSquareError_NEON(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t sse;
  asm volatile(
      "vmov.u8 q8, #0 \n"
      "vmov.u8 q10, #0 \n"
      "vmov.u8 q9, #0 \n"
      "vmov.u8 q11, #0 \n"

      "1: \n"
      "vld1.8 {q0}, [%0]! \n"
      "vld1.8 {q1}, [%1]! \n"
      "subs %2, %2, #16 \n"
      "vsubl.u8 q2, d0, d2 \n"
      "vsubl.u8 q3, d1, d3 \n"
      "vmlal.s16 q8, d4, d4 \n"
      "vmlal.s16 q9, d6, d6 \n"
      "vmlal.s16 q10, d5, d5 \n"
      "vmlal.s16 q11, d7, d7 \n"
      "bgt 1b \n"

      "vadd.u32 q8, q8, q9 \n"
      "vadd.u32 q10, q10, q11 \n"
      "vadd.u32 q11, q8, q10 \n"
      "vpaddl.u32 q1, q11 \n"
      "vadd.u64 d0, d2, d3 \n"
      "vmov.32 %3, d0[0] \n"
      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
  return sse;
}

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/source/compare_neon64.cc000066400000000000000000000067171416500237200221670ustar00rootroot00000000000000
/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
*/ #include "libyuv/basic_types.h" #include "libyuv/compare_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // 256 bits at a time // uses short accumulator which restricts count to 131 KB uint32_t HammingDistance_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff; asm volatile( "movi v4.8h, #0 \n" "1: \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n" "ld1 {v2.16b, v3.16b}, [%1], #32 \n" "eor v0.16b, v0.16b, v2.16b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "eor v1.16b, v1.16b, v3.16b \n" "cnt v0.16b, v0.16b \n" "prfm pldl1keep, [%1, 448] \n" "cnt v1.16b, v1.16b \n" "subs %w2, %w2, #32 \n" "add v0.16b, v0.16b, v1.16b \n" "uadalp v4.8h, v0.16b \n" "b.gt 1b \n" "uaddlv s4, v4.8h \n" "fmov %w3, s4 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) : : "cc", "v0", "v1", "v2", "v3", "v4"); return diff; } uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; asm volatile( "eor v16.16b, v16.16b, v16.16b \n" "eor v18.16b, v18.16b, v18.16b \n" "eor v17.16b, v17.16b, v17.16b \n" "eor v19.16b, v19.16b, v19.16b \n" "1: \n" "ld1 {v0.16b}, [%0], #16 \n" "ld1 {v1.16b}, [%1], #16 \n" "subs %w2, %w2, #16 \n" "usubl v2.8h, v0.8b, v1.8b \n" "usubl2 v3.8h, v0.16b, v1.16b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "smlal v16.4s, v2.4h, v2.4h \n" "smlal v17.4s, v3.4h, v3.4h \n" "prfm pldl1keep, [%1, 448] \n" "smlal2 v18.4s, v2.8h, v2.8h \n" "smlal2 v19.4s, v3.8h, v3.8h \n" "b.gt 1b \n" "add v16.4s, v16.4s, v17.4s \n" "add v18.4s, v18.4s, v19.4s \n" "add v19.4s, v16.4s, v18.4s \n" "addv s0, v19.4s \n" "fmov %w3, s0 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif 
libyuv-0.0~git20220104.b91df1a/source/compare_win.cc000066400000000000000000000151661416500237200216510ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/basic_types.h" #include "libyuv/compare_row.h" #include "libyuv/row.h" #if defined(_MSC_VER) #include // For __popcnt #endif #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && defined(_M_IX86) uint32_t HammingDistance_SSE42(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t diff = 0u; int i; for (i = 0; i < count - 3; i += 4) { uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT src_a += 4; src_b += 4; diff += __popcnt(x); } return diff; } __declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 wloop: movdqu xmm1, [eax] lea eax, [eax + 16] movdqu xmm2, [edx] lea edx, [edx + 16] movdqa xmm3, xmm1 // abs trick psubusb xmm1, xmm2 psubusb xmm2, xmm3 por xmm1, xmm2 movdqa xmm2, xmm1 punpcklbw xmm1, xmm5 punpckhbw xmm2, xmm5 pmaddwd xmm1, xmm1 pmaddwd xmm2, xmm2 paddd xmm0, xmm1 paddd xmm0, xmm2 sub ecx, 16 jg wloop pshufd xmm1, xmm0, 0xee paddd xmm0, xmm1 pshufd xmm1, xmm0, 0x01 paddd xmm0, xmm1 movd eax, xmm0 ret } } #ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
#pragma warning(disable : 4752) __declspec(naked) uint32_t SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b mov ecx, [esp + 12] // count vpxor ymm0, ymm0, ymm0 // sum vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax wloop: vmovdqu ymm1, [eax] vmovdqu ymm2, [eax + edx] lea eax, [eax + 32] vpsubusb ymm3, ymm1, ymm2 // abs difference trick vpsubusb ymm2, ymm2, ymm1 vpor ymm1, ymm2, ymm3 vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order. vpunpckhbw ymm1, ymm1, ymm5 vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32. vpmaddwd ymm1, ymm1, ymm1 vpaddd ymm0, ymm0, ymm1 vpaddd ymm0, ymm0, ymm2 sub ecx, 32 jg wloop vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes. vpaddd ymm0, ymm0, ymm1 vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes. vpaddd ymm0, ymm0, ymm1 vpermq ymm1, ymm0, 0x02 // high + low lane. vpaddd ymm0, ymm0, ymm1 vmovd eax, xmm0 vzeroupper ret } } #endif // HAS_SUMSQUAREERROR_AVX2 uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { 0x0c3525e1, // 33 ^ 15 0xa3476dc1, // 33 ^ 14 0x3b4039a1, // 33 ^ 13 0x4f5f0981, // 33 ^ 12 }; uvec32 kHashMul1 = { 0x30f35d61, // 33 ^ 11 0x855cb541, // 33 ^ 10 0x040a9121, // 33 ^ 9 0x747c7101, // 33 ^ 8 }; uvec32 kHashMul2 = { 0xec41d4e1, // 33 ^ 7 0x4cfa3cc1, // 33 ^ 6 0x025528a1, // 33 ^ 5 0x00121881, // 33 ^ 4 }; uvec32 kHashMul3 = { 0x00008c61, // 33 ^ 3 0x00000441, // 33 ^ 2 0x00000021, // 33 ^ 1 0x00000001, // 33 ^ 0 }; __declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count movd xmm0, [esp + 12] // seed pxor xmm7, xmm7 // constant 0 for unpck movdqa xmm6, xmmword ptr kHash16x33 wloop: movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] pmulld xmm0, xmm6 // hash *= 33 ^ 16 movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 punpcklwd xmm3, xmm7 // src[0-3] 
pmulld xmm3, xmm5 movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 punpckhwd xmm4, xmm7 // src[4-7] pmulld xmm4, xmm5 movdqa xmm5, xmmword ptr kHashMul2 punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 punpcklwd xmm2, xmm7 // src[8-11] pmulld xmm2, xmm5 movdqa xmm5, xmmword ptr kHashMul3 punpckhwd xmm1, xmm7 // src[12-15] pmulld xmm1, xmm5 paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 paddd xmm1, xmm3 pshufd xmm2, xmm1, 0x0e // upper 2 dwords paddd xmm1, xmm2 pshufd xmm2, xmm1, 0x01 paddd xmm1, xmm2 paddd xmm0, xmm1 sub ecx, 16 jg wloop movd eax, xmm0 // return hash ret } } // Visual C 2012 required for AVX2. #ifdef HAS_HASHDJB2_AVX2 __declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count vmovd xmm0, [esp + 12] // seed wloop: vpmovzxbd xmm3, [eax] // src[0-3] vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16 vpmovzxbd xmm4, [eax + 4] // src[4-7] vpmulld xmm3, xmm3, xmmword ptr kHashMul0 vpmovzxbd xmm2, [eax + 8] // src[8-11] vpmulld xmm4, xmm4, xmmword ptr kHashMul1 vpmovzxbd xmm1, [eax + 12] // src[12-15] vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] vpmulld xmm1, xmm1, xmmword ptr kHashMul3 vpaddd xmm3, xmm3, xmm4 // add 16 results vpaddd xmm1, xmm1, xmm2 vpaddd xmm1, xmm1, xmm3 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords vpaddd xmm1, xmm1,xmm2 vpshufd xmm2, xmm1, 0x01 vpaddd xmm1, xmm1, xmm2 vpaddd xmm0, xmm0, xmm1 sub ecx, 16 jg wloop vmovd eax, xmm0 // return hash vzeroupper ret } } #endif // HAS_HASHDJB2_AVX2 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/convert.cc000066400000000000000000002744261416500237200210340ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/convert.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() #include "libyuv/scale_uv.h" // For UVScale() #ifdef __cplusplus namespace libyuv { extern "C" { #endif #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) static __inline int Abs(int v) { return v >= 0 ? v : -v; } // Any I4xx To I420 format with mirroring. static int I4xxToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int src_y_width, int src_y_height, int src_uv_width, int src_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); if (src_uv_width <= 0 || src_uv_height == 0) { return -1; } if (dst_y) { ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } // Copy I420 with optional flipping. // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure // is does row coalescing. 
LIBYUV_API int I420Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } // Copy UV planes. CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } // Copy I010 with optional flipping. LIBYUV_API int I010Copy(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } if (dst_y) { CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } // Copy UV planes. 
CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } static int Planar16bitTo8bit(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, int subsample_x, int subsample_y, int depth) { int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); int scale = 1 << (24 - depth); if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; uv_height = -uv_height; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (uv_height - 1) * src_stride_u; src_v = src_v + (uv_height - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } // Convert Y plane. Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale, width, height); // Convert UV planes. Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale, uv_width, uv_height); Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale, uv_width, uv_height); return 0; } // Convert 10 bit YUV to 8 bit. 
LIBYUV_API int I010ToI420(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 1, 1, 10); } LIBYUV_API int I210ToI422(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 1, 0, 10); } LIBYUV_API int I410ToI444(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 0, 0, 10); } LIBYUV_API int I012ToI420(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 1, 1, 12); } LIBYUV_API int I212ToI422(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int 
dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 1, 0, 12); } LIBYUV_API int I412ToI444(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Planar16bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 0, 0, 12); } // Any Ix10 To I010 format with mirroring. static int Ix10ToI010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height, int subsample_x, int subsample_y) { const int dst_y_width = Abs(width); const int dst_y_height = Abs(height); const int src_uv_width = SUBSAMPLE(width, subsample_x, subsample_x); const int src_uv_height = SUBSAMPLE(height, subsample_y, subsample_y); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1); if (width <= 0 || height == 0) { return -1; } if (dst_y) { ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } ScalePlane_12(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); ScalePlane_12(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } LIBYUV_API int I410ToI010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* 
src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 0, 0); } LIBYUV_API int I210ToI010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { return Ix10ToI010(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, 1, 0); } // Any I[420]1[02] to P[420]1[02] format with mirroring. static int IxxxToPxxx(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height, int subsample_x, int subsample_y, int depth) { const int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); const int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); if (width <= 0 || height == 0) { return -1; } ConvertToMSBPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height, depth); MergeUVPlane_16(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, uv_width, uv_height, depth); return 0; } LIBYUV_API int I010ToP010(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height) { return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 10); } LIBYUV_API int I210ToP210(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int 
src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height) { return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 10); } LIBYUV_API int I012ToP012(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height) { return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 12); } LIBYUV_API int I212ToP212(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height) { return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 12); } // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API int I422ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { const int src_uv_width = SUBSAMPLE(width, 1, 1); return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, src_uv_width, height); } // TODO(fbarchard): Implement row conversion. 
LIBYUV_API int I422ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (height - 1) * src_stride_u; src_v = src_v + (height - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } // Allocate u and v buffers align_buffer_64(plane_u, halfwidth * halfheight * 2); uint8_t* plane_v = plane_u + halfwidth * halfheight; I422ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width, height); MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu, halfwidth, halfheight); free_aligned_buffer_64(plane_u); return 0; } #ifdef I422TONV21_ROW_VERSION // Unittest fails for this version. // 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height // Swap src_u and src_v to implement I422ToNV12 LIBYUV_API int I422ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_u || !src_v || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { MergeUVRow = MergeUVRow_AVX2; } } #endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_NEON; } } #endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow = MergeUVRow_Any_MMI; if (IS_ALIGNED(halfwidth, 8)) { MergeUVRow = MergeUVRow_MMI; } } #endif #if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MergeUVRow = MergeUVRow_Any_MSA; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow = MergeUVRow_MSA; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(width, 8)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = 
InterpolateRow_Any_MSA; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, halfwidth, height); } { // Allocate 2 rows of vu. int awidth = halfwidth * 2; align_buffer_64(row_vu_0, awidth * 2); uint8_t* row_vu_1 = row_vu_0 + awidth; for (y = 0; y < height - 1; y += 2) { MergeUVRow(src_v, src_u, row_vu_0, halfwidth); MergeUVRow(src_v + src_stride_v, src_u + src_stride_u, row_vu_1, halfwidth); InterpolateRow(dst_vu, row_vu_0, awidth, awidth, 128); src_u += src_stride_u * 2; src_v += src_stride_v * 2; dst_vu += dst_stride_vu; } if (height & 1) { MergeUVRow(src_v, src_u, dst_vu, halfwidth); } free_aligned_buffer_64(row_vu_0); } return 0; } #endif // I422TONV21_ROW_VERSION // 444 chroma is 1x width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API int I444ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, width, height); } LIBYUV_API int I444ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (height - 1) * src_stride_u; src_v = src_v + (height - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, width, height); return 0; } LIBYUV_API int I444ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // I400 is greyscale typically used in MJPG LIBYUV_API int I400ToI420(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128); SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128); return 0; } // I400 is greyscale typically used in MJPG LIBYUV_API int I400ToNV21(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } SetPlane(dst_vu, dst_stride_vu, halfwidth * 2, halfheight, 128); return 0; } // Convert NV12 to I420. // TODO(fbarchard): Consider inverting destination. Faster on ARM with prfm. LIBYUV_API int NV12ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_uv = src_uv + (halfheight - 1) * src_stride_uv; src_stride_y = -src_stride_y; src_stride_uv = -src_stride_uv; } // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } // Coalesce rows. if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && dst_stride_v == halfwidth) { halfwidth *= halfheight; halfheight = 1; src_stride_uv = dst_stride_u = dst_stride_v = 0; } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } // Split UV plane - NV12 / NV21 SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } // Convert NV21 to I420. Same as NV12 but u and v pointers swapped. 
LIBYUV_API int NV21ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, width, height); } LIBYUV_API int NV12ToNV24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { if (width <= 0 || height == 0) { return -1; } if (dst_y) { ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); return 0; } LIBYUV_API int NV16ToNV24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { if (width <= 0 || height == 0) { return -1; } if (dst_y) { ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); return 0; } LIBYUV_API int P010ToP410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height) { if (width <= 0 || height == 0) { return -1; } if (dst_y) { ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); return 0; } LIBYUV_API int 
P210ToP410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_uv, int dst_stride_uv, int width, int height) { if (width <= 0 || height == 0) { return -1; } if (dst_y) { ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv, dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); return 0; } // Convert YUY2 to I420. LIBYUV_API int YUY2ToI420(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) = YUY2ToUVRow_C; void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } #if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { YUY2ToUVRow = YUY2ToUVRow_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { YUY2ToUVRow = YUY2ToUVRow_SSE2; YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif #if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { YUY2ToUVRow = YUY2ToUVRow_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { YUY2ToUVRow = YUY2ToUVRow_AVX2; YUY2ToYRow = YUY2ToYRow_AVX2; } } #endif #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; YUY2ToUVRow = YUY2ToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUVRow = YUY2ToUVRow_NEON; } } #endif #if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { YUY2ToYRow = YUY2ToYRow_Any_MMI; YUY2ToUVRow = YUY2ToUVRow_Any_MMI; 
if (IS_ALIGNED(width, 8)) { YUY2ToYRow = YUY2ToYRow_MMI; if (IS_ALIGNED(width, 16)) { YUY2ToUVRow = YUY2ToUVRow_MMI; } } } #endif #if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { YUY2ToYRow = YUY2ToYRow_Any_MSA; YUY2ToUVRow = YUY2ToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { YUY2ToYRow = YUY2ToYRow_MSA; YUY2ToUVRow = YUY2ToUVRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); YUY2ToYRow(src_yuy2, dst_y, width); YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width); src_yuy2 += src_stride_yuy2 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width); YUY2ToYRow(src_yuy2, dst_y, width); } return 0; } // Convert UYVY to I420. LIBYUV_API int UYVYToI420(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) = UYVYToUVRow_C; void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = UYVYToYRow_C; // Negative height means invert the image. 
if (height < 0) { height = -height; src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } #if defined(HAS_UYVYTOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { UYVYToUVRow = UYVYToUVRow_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { UYVYToUVRow = UYVYToUVRow_SSE2; UYVYToYRow = UYVYToYRow_SSE2; } } #endif #if defined(HAS_UYVYTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { UYVYToUVRow = UYVYToUVRow_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { UYVYToUVRow = UYVYToUVRow_AVX2; UYVYToYRow = UYVYToYRow_AVX2; } } #endif #if defined(HAS_UYVYTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; UYVYToUVRow = UYVYToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUVRow = UYVYToUVRow_NEON; } } #endif #if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { UYVYToYRow = UYVYToYRow_Any_MMI; UYVYToUVRow = UYVYToUVRow_Any_MMI; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_MMI; UYVYToUVRow = UYVYToUVRow_MMI; } } #endif #if defined(HAS_UYVYTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { UYVYToYRow = UYVYToYRow_Any_MSA; UYVYToUVRow = UYVYToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { UYVYToYRow = UYVYToYRow_MSA; UYVYToUVRow = UYVYToUVRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); UYVYToYRow(src_uyvy, dst_y, width); UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width); src_uyvy += src_stride_uyvy * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width); UYVYToYRow(src_uyvy, dst_y, width); } return 0; } // Convert AYUV to NV12. 
// Rows are processed in pairs: each pair yields two Y rows and one row of
// interleaved UV. For an odd height the last row is processed alone, with a
// stride of 0 passed to the UV row function.
// A negative height inverts the image.
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int AYUVToNV12(const uint8_t* src_ayuv, int src_stride_ayuv,
               uint8_t* dst_y, int dst_stride_y,
               uint8_t* dst_uv, int dst_stride_uv,
               int width, int height) {
  int y;
  void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
                      uint8_t* dst_uv, int width) = AYUVToUVRow_C;
  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
      AYUVToYRow_C;
  // All planes are read or written unconditionally below, so reject NULL
  // pointers and empty images up front, as the RGB converters do.
  if (!src_ayuv || !dst_y || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
    src_stride_ayuv = -src_stride_ayuv;
  }
// place holders for future intel code
#if defined(HAS_AYUVTOYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    AYUVToUVRow = AYUVToUVRow_Any_SSE2;
    AYUVToYRow = AYUVToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      AYUVToUVRow = AYUVToUVRow_SSE2;
      AYUVToYRow = AYUVToYRow_SSE2;
    }
  }
#endif
#if defined(HAS_AYUVTOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    AYUVToUVRow = AYUVToUVRow_Any_AVX2;
    AYUVToYRow = AYUVToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      AYUVToUVRow = AYUVToUVRow_AVX2;
      AYUVToYRow = AYUVToYRow_AVX2;
    }
  }
#endif

#if defined(HAS_AYUVTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    AYUVToYRow = AYUVToYRow_Any_NEON;
    AYUVToUVRow = AYUVToUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      AYUVToYRow = AYUVToYRow_NEON;
      AYUVToUVRow = AYUVToUVRow_NEON;
    }
  }
#endif

  for (y = 0; y < height - 1; y += 2) {
    AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
    AYUVToYRow(src_ayuv, dst_y, width);
    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
    src_ayuv += src_stride_ayuv * 2;
    dst_y += dst_stride_y * 2;
    dst_uv += dst_stride_uv;
  }
  // Odd height: the last row has no partner, so the UV function gets stride 0.
  if (height & 1) {
    AYUVToUVRow(src_ayuv, 0, dst_uv, width);
    AYUVToYRow(src_ayuv, dst_y, width);
  }
  return 0;
}

// Convert AYUV to NV21.
// Rows are processed in pairs: each pair yields two Y rows and one row of
// interleaved VU. For an odd height the last row is processed alone, with a
// stride of 0 passed to the VU row function.
// A negative height inverts the image.
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int AYUVToNV21(const uint8_t* src_ayuv, int src_stride_ayuv,
               uint8_t* dst_y, int dst_stride_y,
               uint8_t* dst_vu, int dst_stride_vu,
               int width, int height) {
  int y;
  void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
                      uint8_t* dst_vu, int width) = AYUVToVURow_C;
  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
      AYUVToYRow_C;
  // All planes are read or written unconditionally below, so reject NULL
  // pointers and empty images up front, as the RGB converters do.
  if (!src_ayuv || !dst_y || !dst_vu || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
    src_stride_ayuv = -src_stride_ayuv;
  }
// place holders for future intel code
#if defined(HAS_AYUVTOYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    AYUVToVURow = AYUVToVURow_Any_SSE2;
    AYUVToYRow = AYUVToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      AYUVToVURow = AYUVToVURow_SSE2;
      AYUVToYRow = AYUVToYRow_SSE2;
    }
  }
#endif
#if defined(HAS_AYUVTOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    AYUVToVURow = AYUVToVURow_Any_AVX2;
    AYUVToYRow = AYUVToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      AYUVToVURow = AYUVToVURow_AVX2;
      AYUVToYRow = AYUVToYRow_AVX2;
    }
  }
#endif

#if defined(HAS_AYUVTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    AYUVToYRow = AYUVToYRow_Any_NEON;
    AYUVToVURow = AYUVToVURow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      AYUVToYRow = AYUVToYRow_NEON;
      AYUVToVURow = AYUVToVURow_NEON;
    }
  }
#endif

  for (y = 0; y < height - 1; y += 2) {
    AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
    AYUVToYRow(src_ayuv, dst_y, width);
    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
    src_ayuv += src_stride_ayuv * 2;
    dst_y += dst_stride_y * 2;
    dst_vu += dst_stride_vu;
  }
  // Odd height: the last row has no partner, so the VU function gets stride 0.
  if (height & 1) {
    AYUVToVURow(src_ayuv, 0, dst_vu, width);
    AYUVToYRow(src_ayuv, dst_y, width);
  }
  return 0;
}

// Convert ARGB to I420.
LIBYUV_API int ARGBToI420(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } } #endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; } } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_MMI; } 
} #endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); } return 0; } // Convert BGRA to I420. LIBYUV_API int BGRAToI420(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) = BGRAToUVRow_C; void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = BGRAToYRow_C; if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } #if defined(HAS_BGRATOYROW_SSSE3) && defined(HAS_BGRATOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { BGRAToUVRow = BGRAToUVRow_Any_SSSE3; BGRAToYRow = BGRAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { BGRAToUVRow = BGRAToUVRow_SSSE3; BGRAToYRow = BGRAToYRow_SSSE3; } } #endif #if defined(HAS_BGRATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToYRow = BGRAToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { BGRAToYRow = BGRAToYRow_NEON; } } #endif #if defined(HAS_BGRATOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { BGRAToUVRow = BGRAToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { BGRAToUVRow = BGRAToUVRow_NEON; } } #endif #if defined(HAS_BGRATOYROW_MMI) && defined(HAS_BGRATOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { BGRAToYRow = BGRAToYRow_Any_MMI; BGRAToUVRow = BGRAToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { BGRAToYRow = BGRAToYRow_MMI; } if (IS_ALIGNED(width, 16)) { BGRAToUVRow = BGRAToUVRow_MMI; } } #endif #if defined(HAS_BGRATOYROW_MSA) && defined(HAS_BGRATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { BGRAToYRow = BGRAToYRow_Any_MSA; BGRAToUVRow = BGRAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { BGRAToYRow = BGRAToYRow_MSA; BGRAToUVRow = BGRAToUVRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width); BGRAToYRow(src_bgra, dst_y, width); BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width); src_bgra += src_stride_bgra * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width); BGRAToYRow(src_bgra, dst_y, width); } return 0; } // Convert ABGR to I420. 
LIBYUV_API int ABGRToI420(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) = ABGRToUVRow_C; void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } #if defined(HAS_ABGRTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_SSSE3; } } #endif #if defined(HAS_ABGRTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif #if defined(HAS_ABGRTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToYRow = ABGRToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ABGRToYRow = ABGRToYRow_AVX2; } } #endif #if defined(HAS_ABGRTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToUVRow = ABGRToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ABGRToUVRow = ABGRToUVRow_AVX2; } } #endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_NEON; } } #endif #if defined(HAS_ABGRTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVRow = ABGRToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_NEON; } } #endif #if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ABGRToYRow = ABGRToYRow_Any_MMI; ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_MMI; } if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_MMI; } 
} #endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_MSA; ABGRToUVRow = ABGRToUVRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); ABGRToYRow(src_abgr, dst_y, width); ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); src_abgr += src_stride_abgr * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width); ABGRToYRow(src_abgr, dst_y, width); } return 0; } // Convert RGBA to I420. LIBYUV_API int RGBAToI420(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) = RGBAToUVRow_C; void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = RGBAToYRow_C; if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_rgba = src_rgba + (height - 1) * src_stride_rgba; src_stride_rgba = -src_stride_rgba; } #if defined(HAS_RGBATOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGBAToYRow = RGBAToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGBAToYRow = RGBAToYRow_SSSE3; } } #endif #if defined(HAS_RGBATOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGBAToUVRow = RGBAToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGBAToUVRow = RGBAToUVRow_SSSE3; } } #endif #if defined(HAS_RGBATOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToYRow = RGBAToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGBAToYRow = RGBAToYRow_NEON; } } #endif #if defined(HAS_RGBATOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBAToUVRow = RGBAToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { RGBAToUVRow = RGBAToUVRow_NEON; } } #endif #if defined(HAS_RGBATOYROW_MMI) && defined(HAS_RGBATOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGBAToYRow = RGBAToYRow_Any_MMI; RGBAToUVRow = RGBAToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RGBAToYRow = RGBAToYRow_MMI; } if (IS_ALIGNED(width, 16)) { RGBAToUVRow = RGBAToUVRow_MMI; } } #endif #if defined(HAS_RGBATOYROW_MSA) && defined(HAS_RGBATOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGBAToYRow = RGBAToYRow_Any_MSA; RGBAToUVRow = RGBAToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGBAToYRow = RGBAToYRow_MSA; RGBAToUVRow = RGBAToUVRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); RGBAToYRow(src_rgba, dst_y, width); RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width); src_rgba += src_stride_rgba * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width); RGBAToYRow(src_rgba, dst_y, width); } return 0; } // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \ defined(HAS_RGB24TOYROW_MMI)) #define HAS_RGB24TOYROW #endif // Convert RGB24 to I420. 
LIBYUV_API int RGB24ToI420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if defined(HAS_RGB24TOYROW) void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVRow_C; void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYRow_C; #else void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } #if defined(HAS_RGB24TOYROW) // Neon version does direct RGB24 to YUV. 
#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToYRow = RGB24ToYRow_NEON; if (IS_ALIGNED(width, 16)) { RGB24ToUVRow = RGB24ToUVRow_NEON; } } } #endif #if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToUVRow = RGB24ToUVRow_Any_MMI; RGB24ToYRow = RGB24ToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RGB24ToYRow = RGB24ToYRow_MMI; if (IS_ALIGNED(width, 16)) { RGB24ToUVRow = RGB24ToUVRow_MMI; } } } #endif #if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToUVRow = RGB24ToUVRow_Any_MSA; RGB24ToYRow = RGB24ToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB24ToYRow = RGB24ToYRow_MSA; RGB24ToUVRow = RGB24ToUVRow_MSA; } } #endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYROW #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #endif // HAS_RGB24TOYROW { #if !defined(HAS_RGB24TOYROW) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB24TOYROW) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if defined(HAS_RGB24TOYROW) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } #if !defined(HAS_RGB24TOYROW) free_aligned_buffer_64(row); #endif } return 0; } #undef HAS_RGB24TOYROW // Enabled if 1 pass is available #if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ defined(HAS_RGB24TOYJROW_MMI)) #define HAS_RGB24TOYJROW #endif // Convert RGB24 to J420. 
LIBYUV_API int RGB24ToJ420(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if defined(HAS_RGB24TOYJROW) void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVJRow_C; void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYJRow_C; #else void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYJRow_C; #endif if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } #if defined(HAS_RGB24TOYJROW) // Neon version does direct RGB24 to YUV. 
#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; RGB24ToYJRow = RGB24ToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToYJRow = RGB24ToYJRow_NEON; if (IS_ALIGNED(width, 16)) { RGB24ToUVJRow = RGB24ToUVJRow_NEON; } } } #endif #if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; RGB24ToYJRow = RGB24ToYJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RGB24ToYJRow = RGB24ToYJRow_MMI; if (IS_ALIGNED(width, 16)) { RGB24ToUVJRow = RGB24ToUVJRow_MMI; } } } #endif #if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; RGB24ToYJRow = RGB24ToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_MSA; RGB24ToUVJRow = RGB24ToUVJRow_MSA; } } #endif // Other platforms do intermediate conversion from RGB24 to ARGB. #else // HAS_RGB24TOYJROW #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVJRow = ARGBToUVJRow_AVX2; } } #endif #endif // HAS_RGB24TOYJROW { #if !defined(HAS_RGB24TOYJROW) // Allocate 2 rows of 
ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RGB24TOYJROW) RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if defined(HAS_RGB24TOYJROW) RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); #else RGB24ToARGBRow(src_rgb24, row, width); ARGBToUVJRow(row, 0, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); #endif } #if !defined(HAS_RGB24TOYJROW) free_aligned_buffer_64(row); #endif } return 0; } #undef HAS_RGB24TOYJROW // Enabled if 1 pass is available #if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \ defined(HAS_RAWTOYROW_MMI)) #define HAS_RAWTOYROW #endif // Convert RAW to I420. 
LIBYUV_API int RAWToI420(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if defined(HAS_RAWTOYROW) void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVRow_C; void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYRow_C; #else void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } #if defined(HAS_RAWTOYROW) // Neon version does direct RAW to YUV. #if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToUVRow = RAWToUVRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToYRow = RAWToYRow_NEON; if (IS_ALIGNED(width, 16)) { RAWToUVRow = RAWToUVRow_NEON; } } } #endif #if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToUVRow = RAWToUVRow_Any_MMI; RAWToYRow = RAWToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RAWToYRow = RAWToYRow_MMI; if (IS_ALIGNED(width, 16)) { RAWToUVRow = RAWToUVRow_MMI; } } } #endif #if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToUVRow = RAWToUVRow_Any_MSA; RAWToYRow = RAWToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RAWToYRow = RAWToYRow_MSA; RAWToUVRow = RAWToUVRow_MSA; } } #endif // Other platforms do intermediate conversion from RAW to ARGB. 
#else // HAS_RAWTOYROW #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #endif // HAS_RAWTOYROW { #if !defined(HAS_RAWTOYROW) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RAWTOYROW) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if defined(HAS_RAWTOYROW) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else RAWToARGBRow(src_raw, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } #if !defined(HAS_RAWTOYROW) free_aligned_buffer_64(row); #endif } return 0; } #undef HAS_RAWTOYROW // 
Enabled if 1 pass is available #if (defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \ defined(HAS_RAWTOYJROW_MMI)) #define HAS_RAWTOYJROW #endif // Convert RAW to J420. LIBYUV_API int RAWToJ420(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if defined(HAS_RAWTOYJROW) void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVJRow_C; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYJRow_C; #else void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYJRow_C; #endif if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } #if defined(HAS_RAWTOYJROW) // Neon version does direct RAW to YUV. 
#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToUVJRow = RAWToUVJRow_Any_NEON; RAWToYJRow = RAWToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToYJRow = RAWToYJRow_NEON; if (IS_ALIGNED(width, 16)) { RAWToUVJRow = RAWToUVJRow_NEON; } } } #endif #if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToUVJRow = RAWToUVJRow_Any_MMI; RAWToYJRow = RAWToYJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RAWToYJRow = RAWToYJRow_MMI; if (IS_ALIGNED(width, 16)) { RAWToUVJRow = RAWToUVJRow_MMI; } } } #endif #if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToUVJRow = RAWToUVJRow_Any_MSA; RAWToYJRow = RAWToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RAWToYJRow = RAWToYJRow_MSA; RAWToUVJRow = RAWToUVJRow_MSA; } } #endif // Other platforms do intermediate conversion from RAW to ARGB. #else // HAS_RAWTOYJROW #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVJRow = ARGBToUVJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVJRow = ARGBToUVJRow_AVX2; } } #endif #endif // HAS_RAWTOYJROW { #if !defined(HAS_RAWTOYJROW) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if defined(HAS_RAWTOYJROW) RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYJRow(src_raw, dst_y, width); RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); #else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if defined(HAS_RAWTOYJROW) RAWToUVJRow(src_raw, 0, dst_u, dst_v, width); RAWToYJRow(src_raw, dst_y, width); #else RAWToARGBRow(src_raw, row, width); ARGBToUVJRow(row, 0, dst_u, dst_v, width); ARGBToYJRow(row, dst_y, width); #endif } #if !defined(HAS_RAWTOYJROW) free_aligned_buffer_64(row); #endif } return 0; } #undef HAS_RAWTOYJROW // Convert RGB565 to I420. 
LIBYUV_API int RGB565ToI420(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_MMI)) void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB565ToUVRow_C; void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = RGB565ToYRow_C; #else void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB565ToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565; } // Neon version does direct RGB565 to YUV. #if defined(HAS_RGB565TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB565ToUVRow = RGB565ToUVRow_Any_NEON; RGB565ToYRow = RGB565ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB565ToYRow = RGB565ToYRow_NEON; if (IS_ALIGNED(width, 16)) { RGB565ToUVRow = RGB565ToUVRow_NEON; } } } // MMI and MSA version does direct RGB565 to YUV. 
#elif (defined(HAS_RGB565TOYROW_MMI) || defined(HAS_RGB565TOYROW_MSA)) #if defined(HAS_RGB565TOYROW_MMI) && defined(HAS_RGB565TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB565ToUVRow = RGB565ToUVRow_Any_MMI; RGB565ToYRow = RGB565ToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RGB565ToYRow = RGB565ToYRow_MMI; if (IS_ALIGNED(width, 16)) { RGB565ToUVRow = RGB565ToUVRow_MMI; } } } #endif #if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToUVRow = RGB565ToUVRow_Any_MSA; RGB565ToYRow = RGB565ToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB565ToYRow = RGB565ToYRow_MSA; RGB565ToUVRow = RGB565ToUVRow_MSA; } } #endif // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } #endif #if defined(HAS_RGB565TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { RGB565ToARGBRow = RGB565ToARGBRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #endif { #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_MMI)) // Allocate 2 rows of ARGB. 
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_MMI)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); #else RGB565ToARGBRow(src_rgb565, row, width); RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_rgb565 += src_stride_rgb565 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_MMI)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else RGB565ToARGBRow(src_rgb565, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } #if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \ defined(HAS_RGB565TOYROW_MMI)) free_aligned_buffer_64(row); #endif } return 0; } // Convert ARGB1555 to I420. 
LIBYUV_API int ARGB1555ToI420(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_MMI)) void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB1555ToUVRow_C; void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, int width) = ARGB1555ToYRow_C; #else void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = ARGB1555ToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } // Neon version does direct ARGB1555 to YUV. #if defined(HAS_ARGB1555TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB1555ToYRow = ARGB1555ToYRow_NEON; if (IS_ALIGNED(width, 16)) { ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; } } } // MMI and MSA version does direct ARGB1555 to YUV. 
#elif (defined(HAS_ARGB1555TOYROW_MMI) || defined(HAS_ARGB1555TOYROW_MSA)) #if defined(HAS_ARGB1555TOYROW_MMI) && defined(HAS_ARGB1555TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MMI; ARGB1555ToYRow = ARGB1555ToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGB1555ToYRow = ARGB1555ToYRow_MMI; if (IS_ALIGNED(width, 16)) { ARGB1555ToUVRow = ARGB1555ToUVRow_MMI; } } } #endif #if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGB1555ToYRow = ARGB1555ToYRow_MSA; ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; } } #endif // Other platforms do intermediate conversion from ARGB1555 to ARGB. #else #if defined(HAS_ARGB1555TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } #endif #if defined(HAS_ARGB1555TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #endif { #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ 
defined(HAS_ARGB1555TOYROW_MMI)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_MMI)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, width); #else ARGB1555ToARGBRow(src_argb1555, row, width); ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_argb1555 += src_stride_argb1555 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_MMI)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else ARGB1555ToARGBRow(src_argb1555, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } #if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \ defined(HAS_ARGB1555TOYROW_MMI)) free_aligned_buffer_64(row); #endif } return 0; } // Convert ARGB4444 to I420. 
LIBYUV_API int ARGB4444ToI420(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; #if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGB4444ToUVRow_C; void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, int width) = ARGB4444ToYRow_C; #else void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = ARGB4444ToARGBRow_C; void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } // Neon version does direct ARGB4444 to YUV. #if defined(HAS_ARGB4444TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB4444ToYRow = ARGB4444ToYRow_NEON; if (IS_ALIGNED(width, 16)) { ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; } } } #elif defined(HAS_ARGB4444TOYROW_MMI) && defined(HAS_ARGB4444TOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB4444ToUVRow = ARGB4444ToUVRow_Any_MMI; ARGB4444ToYRow = ARGB4444ToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGB4444ToYRow = ARGB4444ToYRow_MMI; if (IS_ALIGNED(width, 16)) { ARGB4444ToUVRow = ARGB4444ToUVRow_MMI; } } } // Other platforms do intermediate conversion from ARGB4444 to ARGB. 
#else #if defined(HAS_ARGB4444TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } #endif #if defined(HAS_ARGB4444TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; } } #endif #if defined(HAS_ARGB4444TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; } } #endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToUVRow = ARGBToUVRow_Any_MMI; ARGBToYRow = ARGBToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_MMI; } } } #endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToUVRow = ARGBToUVRow_Any_MSA; ARGBToYRow = ARGBToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } } #endif #endif { #if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) // Allocate 2 rows of 
ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { #if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, width); #else ARGB4444ToARGBRow(src_argb4444, row, width); ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); #endif src_argb4444 += src_stride_argb4444 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { #if (defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); #else ARGB4444ToARGBRow(src_argb4444, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } #if !(defined(HAS_ARGB4444TOYROW_NEON) || defined(HAS_ARGB4444TOYROW_MMI)) free_aligned_buffer_64(row); #endif } return 0; } // Convert RGB24 to J400. LIBYUV_API int RGB24ToJ400(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_yj, int dst_stride_yj, int width, int height) { int y; void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = RGB24ToYJRow_C; if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } // Coalesce rows. 
if (src_stride_rgb24 == width * 3 && dst_stride_yj == width) { width *= height; height = 1; src_stride_rgb24 = dst_stride_yj = 0; } #if defined(HAS_RGB24TOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToYJRow = RGB24ToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_SSSE3; } } #endif #if defined(HAS_RGB24TOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RGB24ToYJRow = RGB24ToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { RGB24ToYJRow = RGB24ToYJRow_AVX2; } } #endif #if defined(HAS_RGB24TOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToYJRow = RGB24ToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToYJRow = RGB24ToYJRow_NEON; } } #endif #if defined(HAS_RGB24TOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToYJRow = RGB24ToYJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RGB24ToYJRow = RGB24ToYJRow_MMI; } } #endif #if defined(HAS_RGB24TOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToYJRow = RGB24ToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB24ToYJRow = RGB24ToYJRow_MSA; } } #endif for (y = 0; y < height; ++y) { RGB24ToYJRow(src_rgb24, dst_yj, width); src_rgb24 += src_stride_rgb24; dst_yj += dst_stride_yj; } return 0; } // Convert RAW to J400. LIBYUV_API int RAWToJ400(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_yj, int dst_stride_yj, int width, int height) { int y; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = RAWToYJRow_C; if (!src_raw || !dst_yj || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } // Coalesce rows. 
if (src_stride_raw == width * 3 && dst_stride_yj == width) { width *= height; height = 1; src_stride_raw = dst_stride_yj = 0; } #if defined(HAS_RAWTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToYJRow = RAWToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToYJRow = RAWToYJRow_SSSE3; } } #endif #if defined(HAS_RAWTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RAWToYJRow = RAWToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { RAWToYJRow = RAWToYJRow_AVX2; } } #endif #if defined(HAS_RAWTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToYJRow = RAWToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToYJRow = RAWToYJRow_NEON; } } #endif #if defined(HAS_RAWTOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToYJRow = RAWToYJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { RAWToYJRow = RAWToYJRow_MMI; } } #endif #if defined(HAS_RAWTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToYJRow = RAWToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RAWToYJRow = RAWToYJRow_MSA; } } #endif for (y = 0; y < height; ++y) { RAWToYJRow(src_raw, dst_yj, width); src_raw += src_stride_raw; dst_yj += dst_stride_yj; } return 0; } // Convert Android420 to I420. LIBYUV_API int Android420ToI420(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { return Android420ToI420Rotate(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_pixel_stride_uv, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, kRotate0); } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/convert_argb.cc000066400000000000000000005101721416500237200220160ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/convert_argb.h" #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif #include "libyuv/planar_functions.h" // For CopyPlane and ARGBShuffle. #include "libyuv/rotate_argb.h" #include "libyuv/row.h" #include "libyuv/video_common.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Copy ARGB with optional flipping LIBYUV_API int ARGBCopy(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, height); return 0; } // Convert I420 to ARGB with matrix. LIBYUV_API int I420ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; } } #endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; } } #endif #if defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToARGBRow = I422ToARGBRow_MMI; } } #endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to ARGB. LIBYUV_API int I420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert I420 to ABGR. 
// I420 to ABGR: runs the ARGB conversion with the U and V planes exchanged
// and the Yvu matrix kYvuI601Constants.
LIBYUV_API
int I420ToABGR(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_abgr,
               int dst_stride_abgr,
               int width,
               int height) {
  const struct YuvConstants* const matrix = &kYvuI601Constants;
  return I420ToARGBMatrix(src_y, src_stride_y,
                          src_v, src_stride_v,  // V takes the U slot.
                          src_u, src_stride_u,  // U takes the V slot.
                          dst_abgr, dst_stride_abgr, matrix, width, height);
}

// J420 to ARGB: I420ToARGBMatrix with kYuvJPEGConstants.
LIBYUV_API
int J420ToARGB(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height) {
  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_argb, dst_stride_argb, matrix,
                          width, height);
}

// J420 to ABGR: runs the ARGB conversion with the U and V planes exchanged
// and the Yvu matrix kYvuJPEGConstants.
LIBYUV_API
int J420ToABGR(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_abgr,
               int dst_stride_abgr,
               int width,
               int height) {
  const struct YuvConstants* const matrix = &kYvuJPEGConstants;
  return I420ToARGBMatrix(src_y, src_stride_y,
                          src_v, src_stride_v,  // V takes the U slot.
                          src_u, src_stride_u,  // U takes the V slot.
                          dst_abgr, dst_stride_abgr, matrix, width, height);
}

// H420 to ARGB: I420ToARGBMatrix with kYuvH709Constants.
LIBYUV_API
int H420ToARGB(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_u,
               int src_stride_u,
               const uint8_t* src_v,
               int src_stride_v,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height) {
  const struct YuvConstants* const matrix = &kYuvH709Constants;
  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                          src_stride_v, dst_argb, dst_stride_argb, matrix,
                          width, height);
}

// Convert H420 to ABGR.
LIBYUV_API int H420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert U420 to ARGB. LIBYUV_API int U420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuv2020Constants, width, height); } // Convert U420 to ABGR. LIBYUV_API int U420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvu2020Constants, // Use Yvu matrix width, height); } // Convert I422 to ARGB with matrix. LIBYUV_API int I422ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; } } #endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; } } #endif #if defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToARGBRow = I422ToARGBRow_MMI; } } #endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I422 to ARGB. LIBYUV_API int I422ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert I422 to ABGR. 
LIBYUV_API
int I422ToABGR(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_abgr, int dst_stride_abgr,
               int width, int height) {
  // ABGR output reuses the ARGB converter: the U and V planes trade places
  // and the Yvu matrix is supplied.
  const struct YuvConstants* const matrix = &kYvuI601Constants;
  return I422ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v,
                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                          matrix, width, height);
}

// Convert J422 to ARGB (kYuvJPEGConstants).
LIBYUV_API
int J422ToARGB(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_argb, int dst_stride_argb,
               int width, int height) {
  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
                          src_v, src_stride_v, dst_argb, dst_stride_argb,
                          matrix, width, height);
}

// Convert J422 to ABGR: U and V are swapped and the Yvu matrix is used.
LIBYUV_API
int J422ToABGR(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_abgr, int dst_stride_abgr,
               int width, int height) {
  const struct YuvConstants* const matrix = &kYvuJPEGConstants;
  return I422ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v,
                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                          matrix, width, height);
}

// Convert H422 to ARGB (kYuvH709Constants).
LIBYUV_API
int H422ToARGB(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_argb, int dst_stride_argb,
               int width, int height) {
  const struct YuvConstants* const matrix = &kYuvH709Constants;
  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
                          src_v, src_stride_v, dst_argb, dst_stride_argb,
                          matrix, width, height);
}

// Convert H422 to ABGR.
LIBYUV_API int H422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I422ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert U422 to ARGB. LIBYUV_API int U422ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuv2020Constants, width, height); } // Convert U422 to ABGR. LIBYUV_API int U422ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I422ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvu2020Constants, // Use Yvu matrix width, height); } // Convert I444 to ARGB with matrix. LIBYUV_API int I444ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I444ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } #if defined(HAS_I444TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif #if defined(HAS_I444TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I444ToARGBRow = I444ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I444ToARGBRow = I444ToARGBRow_AVX2; } } #endif #if defined(HAS_I444TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I444ToARGBRow = I444ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I444ToARGBRow = I444ToARGBRow_NEON; } } #endif #if defined(HAS_I444TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I444ToARGBRow = I444ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I444ToARGBRow = I444ToARGBRow_MMI; } } #endif #if defined(HAS_I444TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I444ToARGBRow = I444ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I444ToARGBRow = I444ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I444 to ARGB. LIBYUV_API int I444ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert I444 to ABGR. 
LIBYUV_API
int I444ToABGR(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_abgr, int dst_stride_abgr,
               int width, int height) {
  // ABGR output reuses the ARGB converter: the U and V planes trade places
  // and the Yvu matrix is supplied.
  const struct YuvConstants* const matrix = &kYvuI601Constants;
  return I444ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v,
                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                          matrix, width, height);
}

// Convert J444 to ARGB (kYuvJPEGConstants).
LIBYUV_API
int J444ToARGB(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_argb, int dst_stride_argb,
               int width, int height) {
  const struct YuvConstants* const matrix = &kYuvJPEGConstants;
  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
                          src_v, src_stride_v, dst_argb, dst_stride_argb,
                          matrix, width, height);
}

// Convert J444 to ABGR: U and V are swapped and the Yvu matrix is used.
LIBYUV_API
int J444ToABGR(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_abgr, int dst_stride_abgr,
               int width, int height) {
  const struct YuvConstants* const matrix = &kYvuJPEGConstants;
  return I444ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v,
                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                          matrix, width, height);
}

// Convert H444 to ARGB (kYuvH709Constants).
LIBYUV_API
int H444ToARGB(const uint8_t* src_y, int src_stride_y,
               const uint8_t* src_u, int src_stride_u,
               const uint8_t* src_v, int src_stride_v,
               uint8_t* dst_argb, int dst_stride_argb,
               int width, int height) {
  const struct YuvConstants* const matrix = &kYuvH709Constants;
  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u,
                          src_v, src_stride_v, dst_argb, dst_stride_argb,
                          matrix, width, height);
}

// Convert H444 to ABGR.
LIBYUV_API int H444ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I444ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert U444 to ARGB. LIBYUV_API int U444ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuv2020Constants, width, height); } // Convert U444 to ABGR. LIBYUV_API int U444ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I444ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvu2020Constants, // Use Yvu matrix width, height); } // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_I210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif #if defined(HAS_I210TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I210ToAR30Row = I210ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I210ToAR30Row = I210ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I010 to AR30. LIBYUV_API int I010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuvI601Constants, width, height); } // Convert H010 to AR30. LIBYUV_API int H010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuvH709Constants, width, height); } // Convert U010 to AR30. LIBYUV_API int U010ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuv2020Constants, width, height); } // Convert I010 to AB30. 
LIBYUV_API int I010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYvuI601Constants, width, height); } // Convert H010 to AB30. LIBYUV_API int H010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYvuH709Constants, width, height); } // Convert U010 to AB30. LIBYUV_API int U010ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYuv2020Constants, width, height); } // Convert 12 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I212ToARGB to // multiply 12 bit yuv into high bits to allow any number of bits. LIBYUV_API int I012ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I212ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_I212TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I212ToAR30Row = I212ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I212ToAR30Row = I212ToAR30Row_SSSE3; } } #endif #if defined(HAS_I212TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I212ToAR30Row = I212ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I212ToAR30Row = I212ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { I212ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert 10 bit YUV to ARGB with matrix. // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. LIBYUV_API int I210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_I210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif #if defined(HAS_I210TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I210ToAR30Row = I210ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I210ToAR30Row = I210ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I210 to AR30. LIBYUV_API int I210ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuvI601Constants, width, height); } // Convert H210 to AR30. LIBYUV_API int H210ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuvH709Constants, width, height); } // Convert U210 to AR30. LIBYUV_API int U210ToAR30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I210ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuv2020Constants, width, height); } // Convert I210 to AB30. 
LIBYUV_API int I210ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYvuI601Constants, width, height); } // Convert H210 to AB30. LIBYUV_API int H210ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYvuH709Constants, width, height); } // Convert U210 to AB30. LIBYUV_API int U210ToAB30(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I210ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYuv2020Constants, width, height); } LIBYUV_API int I410ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_I410TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I410ToAR30Row = I410ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I410ToAR30Row = I410ToAR30Row_SSSE3; } } #endif #if defined(HAS_I410TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I410ToAR30Row = I410ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I410ToAR30Row = I410ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I210TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif #if defined(HAS_I210TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I210ToARGBRow = I210ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I210ToARGBRow = I210ToARGBRow_AVX2; } } #endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I010 to ARGB. LIBYUV_API int I010ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert I010 to ABGR. LIBYUV_API int I010ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I010ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert H010 to ARGB. LIBYUV_API int H010ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvH709Constants, width, height); } // Convert H010 to ABGR. 
LIBYUV_API int H010ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I010ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert U010 to ARGB. LIBYUV_API int U010ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuv2020Constants, width, height); } // Convert U010 to ABGR. LIBYUV_API int U010ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I010ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvu2020Constants, // Use Yvu matrix width, height); } // Convert 12 bit YUV to ARGB with matrix. LIBYUV_API int I012ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I212ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I212ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I212TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I212ToARGBRow = I212ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I212ToARGBRow = I212ToARGBRow_SSSE3; } } #endif #if defined(HAS_I212TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I212ToARGBRow = I212ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I212ToARGBRow = I212ToARGBRow_AVX2; } } #endif for (y = 0; y < height; ++y) { I212ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert 10 bit 422 YUV to ARGB with matrix. LIBYUV_API int I210ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I210ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I210TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I210ToARGBRow = I210ToARGBRow_SSSE3; } } #endif #if defined(HAS_I210TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I210ToARGBRow = I210ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I210ToARGBRow = I210ToARGBRow_AVX2; } } #endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I210 to ARGB. LIBYUV_API int I210ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert I210 to ABGR. LIBYUV_API int I210ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I210ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert H210 to ARGB. LIBYUV_API int H210ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuvH709Constants, width, height); } // Convert H210 to ABGR. 
LIBYUV_API int H210ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I210ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert U210 to ARGB. LIBYUV_API int U210ToARGB(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return I210ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_argb, dst_stride_argb, &kYuv2020Constants, width, height); } // Convert U210 to ABGR. LIBYUV_API int U210ToABGR(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return I210ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvu2020Constants, // Use Yvu matrix width, height); } LIBYUV_API int I410ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I410ToARGBRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I410TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I410ToARGBRow = I410ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I410ToARGBRow = I410ToARGBRow_SSSE3; } } #endif #if defined(HAS_I410TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I410ToARGBRow = I410ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I410ToARGBRow = I410ToARGBRow_AVX2; } } #endif for (y = 0; y < height; ++y) { I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } LIBYUV_API int P010ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_P210TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { P210ToARGBRow = P210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { P210ToARGBRow = P210ToARGBRow_SSSE3; } } #endif #if defined(HAS_P210TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { P210ToARGBRow = P210ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { P210ToARGBRow = P210ToARGBRow_AVX2; } } #endif for (y = 0; y < height; ++y) { P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } } return 0; } LIBYUV_API int P210ToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*P210ToARGBRow)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_P210TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { P210ToARGBRow = P210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { P210ToARGBRow = P210ToARGBRow_SSSE3; } } #endif #if defined(HAS_P210TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { P210ToARGBRow = P210ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { P210ToARGBRow = P210ToARGBRow_AVX2; } } #endif for (y = 0; y < height; ++y) { P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_uv += src_stride_uv; } return 0; } LIBYUV_API int P010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_P210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { P210ToAR30Row = P210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { P210ToAR30Row = P210ToAR30Row_SSSE3; } } #endif #if defined(HAS_P210TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { P210ToAR30Row = P210ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { P210ToAR30Row = P210ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } } return 0; } LIBYUV_API int P210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_uv, int src_stride_uv, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*P210ToAR30Row)( const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = P210ToAR30Row_C; if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_P210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { P210ToAR30Row = P210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { P210ToAR30Row = P210ToAR30Row_SSSE3; } } #endif #if defined(HAS_P210TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { P210ToAR30Row = P210ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { P210ToAR30Row = P210ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; src_uv += src_stride_uv; } return 0; } // Convert I420 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I422ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif #if defined(HAS_I422ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; } } #endif #if defined(HAS_I422ALPHATOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; } } #endif #if defined(HAS_I422ALPHATOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422AlphaToARGBRow = I422AlphaToARGBRow_MMI; } } #endif #if defined(HAS_I422ALPHATOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if 
(TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I422 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I422AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I422ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422AlphaToARGBRow = I422AlphaToARGBRow_SSSE3; } } #endif #if defined(HAS_I422ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422AlphaToARGBRow = I422AlphaToARGBRow_AVX2; } } #endif #if defined(HAS_I422ALPHATOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422AlphaToARGBRow = I422AlphaToARGBRow_NEON; } } #endif #if defined(HAS_I422ALPHATOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422AlphaToARGBRow = I422AlphaToARGBRow_MMI; } } #endif #if defined(HAS_I422ALPHATOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if 
(TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I444 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I444AlphaToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; void (*I444AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I444AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I444ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I444AlphaToARGBRow = I444AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I444AlphaToARGBRow = I444AlphaToARGBRow_SSSE3; } } #endif #if defined(HAS_I444ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I444AlphaToARGBRow = I444AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I444AlphaToARGBRow = I444AlphaToARGBRow_AVX2; } } #endif #if defined(HAS_I444ALPHATOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I444AlphaToARGBRow = I444AlphaToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I444AlphaToARGBRow = I444AlphaToARGBRow_NEON; } } #endif #if defined(HAS_I444ALPHATOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I444AlphaToARGBRow = I444AlphaToARGBRow_MMI; } } #endif #if defined(HAS_I444ALPHATOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I444AlphaToARGBRow = I444AlphaToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I444AlphaToARGBRow = I444AlphaToARGBRow_MSA; } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if 
(TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { I444AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I420 with Alpha to ARGB. LIBYUV_API int I420AlphaToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int attenuate) { return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height, attenuate); } // Convert I420 with Alpha to ABGR. 
// ABGR output is obtained from the ARGB matrix routine by handing it the V
// plane where U is expected (and vice versa) together with the
// kYvuI601Constants table. The callee's status code is returned as-is.
LIBYUV_API
int I420AlphaToABGR(const uint8_t* src_y,
                    int src_stride_y,
                    const uint8_t* src_u,
                    int src_stride_u,
                    const uint8_t* src_v,
                    int src_stride_v,
                    const uint8_t* src_a,
                    int src_stride_a,
                    uint8_t* dst_abgr,
                    int dst_stride_abgr,
                    int width,
                    int height,
                    int attenuate) {
  const struct YuvConstants* const yvu_table = &kYvuI601Constants;
  const int status = I420AlphaToARGBMatrix(
      src_y, src_stride_y,
      src_v, src_stride_v,  // V goes in the U slot
      src_u, src_stride_u,  // U goes in the V slot
      src_a, src_stride_a, dst_abgr, dst_stride_abgr, yvu_table, width, height,
      attenuate);
  return status;
}

// Convert I422 with Alpha to ARGB.
// Delegates to I422AlphaToARGBMatrix with the kYuvI601Constants table; the
// callee's status code is returned as-is.
LIBYUV_API
int I422AlphaToARGB(const uint8_t* src_y,
                    int src_stride_y,
                    const uint8_t* src_u,
                    int src_stride_u,
                    const uint8_t* src_v,
                    int src_stride_v,
                    const uint8_t* src_a,
                    int src_stride_a,
                    uint8_t* dst_argb,
                    int dst_stride_argb,
                    int width,
                    int height,
                    int attenuate) {
  const struct YuvConstants* const yuv_table = &kYuvI601Constants;
  const int status = I422AlphaToARGBMatrix(
      src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a,
      src_stride_a, dst_argb, dst_stride_argb, yuv_table, width, height,
      attenuate);
  return status;
}

// Convert I422 with Alpha to ABGR.
// Same plane exchange as the I420 variant: V is passed as U, U as V, and the
// kYvuI601Constants table is used.
LIBYUV_API
int I422AlphaToABGR(const uint8_t* src_y,
                    int src_stride_y,
                    const uint8_t* src_u,
                    int src_stride_u,
                    const uint8_t* src_v,
                    int src_stride_v,
                    const uint8_t* src_a,
                    int src_stride_a,
                    uint8_t* dst_abgr,
                    int dst_stride_abgr,
                    int width,
                    int height,
                    int attenuate) {
  const struct YuvConstants* const yvu_table = &kYvuI601Constants;
  const int status = I422AlphaToARGBMatrix(
      src_y, src_stride_y,
      src_v, src_stride_v,  // V goes in the U slot
      src_u, src_stride_u,  // U goes in the V slot
      src_a, src_stride_a, dst_abgr, dst_stride_abgr, yvu_table, width, height,
      attenuate);
  return status;
}

// Convert I444 with Alpha to ARGB.
LIBYUV_API int I444AlphaToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int attenuate) { return I444AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_a, src_stride_a, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height, attenuate); } // Convert I444 with Alpha to ABGR. LIBYUV_API int I444AlphaToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, const uint8_t* src_a, int src_stride_a, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height, int attenuate) { return I444AlphaToARGBMatrix( src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height, attenuate); } // Convert I010 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I010AlphaToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I210ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; } } #endif #if defined(HAS_I210ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I210 with Alpha to preattenuated ARGB with matrix. 
LIBYUV_API int I210AlphaToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I210AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I210ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; } } #endif #if defined(HAS_I210ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { 
ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I410 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I410AlphaToARGBMatrix(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height, int attenuate) { int y; void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I410AlphaToARGBRow_C; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_I410ALPHATOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; } } #endif #if defined(HAS_I410ALPHATOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, width); if (attenuate) { ARGBAttenuateRow(dst_argb, dst_argb, width); } dst_argb += dst_stride_argb; src_a += src_stride_a; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I400 to ARGB with matrix. 
// Expands a single 8-bit Y plane to ARGB through the given |yuvconstants|.
// Yields -1 for a NULL |src_y| / |dst_argb|, |width| <= 0 or |height| == 0,
// and 0 otherwise. A negative |height| writes the destination bottom-up.
LIBYUV_API
int I400ToARGBMatrix(const uint8_t* src_y,
                     int src_stride_y,
                     uint8_t* dst_argb,
                     int dst_stride_argb,
                     const struct YuvConstants* yuvconstants,
                     int width,
                     int height) {
  typedef void (*YRowToARGB)(const uint8_t* y_buf, uint8_t* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width);
  YRowToARGB convert_row = I400ToARGBRow_C;
  int rows_left;

  if (!(src_y && dst_argb && width > 0 && height != 0)) {
    return -1;
  }

  // Bottom-up output: start at the final destination row, step backwards.
  if (height < 0) {
    height = -height;
    dst_argb += (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }

  // When both buffers are tightly packed, treat the image as one long row.
  if (src_stride_y == width && dst_stride_argb == width * 4) {
    width *= height;
    height = 1;
    dst_stride_argb = 0;
    src_stride_y = 0;
  }

  // Later sections take precedence over earlier ones.
#if defined(HAS_I400TOARGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    convert_row =
        IS_ALIGNED(width, 8) ? I400ToARGBRow_SSE2 : I400ToARGBRow_Any_SSE2;
  }
#endif
#if defined(HAS_I400TOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    convert_row =
        IS_ALIGNED(width, 16) ? I400ToARGBRow_AVX2 : I400ToARGBRow_Any_AVX2;
  }
#endif
#if defined(HAS_I400TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    convert_row =
        IS_ALIGNED(width, 8) ? I400ToARGBRow_NEON : I400ToARGBRow_Any_NEON;
  }
#endif
#if defined(HAS_I400TOARGBROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    convert_row =
        IS_ALIGNED(width, 8) ? I400ToARGBRow_MMI : I400ToARGBRow_Any_MMI;
  }
#endif
#if defined(HAS_I400TOARGBROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    convert_row =
        IS_ALIGNED(width, 16) ? I400ToARGBRow_MSA : I400ToARGBRow_Any_MSA;
  }
#endif

  for (rows_left = height; rows_left > 0; --rows_left) {
    convert_row(src_y, dst_argb, yuvconstants, width);
    src_y += src_stride_y;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

// Convert I400 to ARGB.
// Shorthand for I400ToARGBMatrix with the kYuvI601Constants table; the
// callee's status code is returned unchanged.
LIBYUV_API
int I400ToARGB(const uint8_t* src_y,
               int src_stride_y,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height) {
  const struct YuvConstants* const yuv_table = &kYuvI601Constants;
  const int status = I400ToARGBMatrix(src_y, src_stride_y, dst_argb,
                                      dst_stride_argb, yuv_table, width,
                                      height);
  return status;
}

// Convert J400 to ARGB.
// Yields -1 for a NULL |src_y| / |dst_argb|, |width| <= 0 or |height| == 0,
// and 0 otherwise. A negative |height| reads the source bottom-up.
LIBYUV_API
int J400ToARGB(const uint8_t* src_y,
               int src_stride_y,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height) {
  typedef void (*YRowToARGB)(const uint8_t* src_y, uint8_t* dst_argb,
                             int width);
  YRowToARGB convert_row = J400ToARGBRow_C;
  int rows_left;

  if (!(src_y && dst_argb && width > 0 && height != 0)) {
    return -1;
  }

  // Bottom-up input: start at the final source row, step backwards.
  if (height < 0) {
    height = -height;
    src_y += (height - 1) * src_stride_y;
    src_stride_y = -src_stride_y;
  }

  // When both buffers are tightly packed, treat the image as one long row.
  if (src_stride_y == width && dst_stride_argb == width * 4) {
    width *= height;
    height = 1;
    dst_stride_argb = 0;
    src_stride_y = 0;
  }

  // Later sections take precedence over earlier ones.
#if defined(HAS_J400TOARGBROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    convert_row =
        IS_ALIGNED(width, 8) ? J400ToARGBRow_SSE2 : J400ToARGBRow_Any_SSE2;
  }
#endif
#if defined(HAS_J400TOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    convert_row =
        IS_ALIGNED(width, 16) ? J400ToARGBRow_AVX2 : J400ToARGBRow_Any_AVX2;
  }
#endif
#if defined(HAS_J400TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    convert_row =
        IS_ALIGNED(width, 8) ? J400ToARGBRow_NEON : J400ToARGBRow_Any_NEON;
  }
#endif
#if defined(HAS_J400TOARGBROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    convert_row =
        IS_ALIGNED(width, 4) ? J400ToARGBRow_MMI : J400ToARGBRow_Any_MMI;
  }
#endif
#if defined(HAS_J400TOARGBROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    convert_row =
        IS_ALIGNED(width, 16) ? J400ToARGBRow_MSA : J400ToARGBRow_Any_MSA;
  }
#endif

  for (rows_left = height; rows_left > 0; --rows_left) {
    convert_row(src_y, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
  }
  return 0;
}

// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; // Shuffle table for converting ABGR to ARGB. static const uvec8 kShuffleMaskABGRToARGB = { 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; // Shuffle table for converting RGBA to ARGB. static const uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; // Shuffle table for converting AR64 to AB64. static const uvec8 kShuffleMaskAR64ToAB64 = { 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; // Convert BGRA to ARGB. LIBYUV_API int BGRAToARGB(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). LIBYUV_API int ARGBToBGRA(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ABGR to ARGB. LIBYUV_API int ABGRToARGB(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). LIBYUV_API int ARGBToABGR(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert RGBA to ARGB. 
LIBYUV_API int RGBAToARGB(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); } // Convert AR64 To AB64. LIBYUV_API int AR64ToAB64(const uint16_t* src_ar64, int src_stride_ar64, uint16_t* dst_ab64, int dst_stride_ab64, int width, int height) { return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); } // Convert RGB24 to ARGB. LIBYUV_API int RGB24ToARGB(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } // Coalesce rows. 
if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb24 = dst_stride_argb = 0; } #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; } } #endif #if defined(HAS_RGB24TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToARGBRow = RGB24ToARGBRow_NEON; } } #endif #if defined(HAS_RGB24TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { RGB24ToARGBRow = RGB24ToARGBRow_MMI; } } #endif #if defined(HAS_RGB24TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB24ToARGBRow = RGB24ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); src_rgb24 += src_stride_rgb24; dst_argb += dst_stride_argb; } return 0; } // Convert RAW to ARGB. LIBYUV_API int RAWToARGB(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; if (!src_raw || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } // Coalesce rows. 
if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_raw = dst_stride_argb = 0; } #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToARGBRow = RAWToARGBRow_SSSE3; } } #endif #if defined(HAS_RAWTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToARGBRow = RAWToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToARGBRow = RAWToARGBRow_NEON; } } #endif #if defined(HAS_RAWTOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToARGBRow = RAWToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { RAWToARGBRow = RAWToARGBRow_MMI; } } #endif #if defined(HAS_RAWTOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToARGBRow = RAWToARGBRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RAWToARGBRow = RAWToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); src_raw += src_stride_raw; dst_argb += dst_stride_argb; } return 0; } // Convert RAW to RGBA. LIBYUV_API int RAWToRGBA(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { int y; void (*RAWToRGBARow)(const uint8_t* src_rgb, uint8_t* dst_rgba, int width) = RAWToRGBARow_C; if (!src_raw || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } // Coalesce rows. 
if (src_stride_raw == width * 3 && dst_stride_rgba == width * 4) { width *= height; height = 1; src_stride_raw = dst_stride_rgba = 0; } #if defined(HAS_RAWTORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToRGBARow = RAWToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RAWToRGBARow = RAWToRGBARow_SSSE3; } } #endif #if defined(HAS_RAWTORGBAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToRGBARow = RAWToRGBARow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToRGBARow = RAWToRGBARow_NEON; } } #endif for (y = 0; y < height; ++y) { RAWToRGBARow(src_raw, dst_rgba, width); src_raw += src_stride_raw; dst_rgba += dst_stride_rgba; } return 0; } // Convert RGB565 to ARGB. LIBYUV_API int RGB565ToARGB(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) = RGB565ToARGBRow_C; if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565; } // Coalesce rows. 
if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb565 = dst_stride_argb = 0; } #if defined(HAS_RGB565TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { RGB565ToARGBRow = RGB565ToARGBRow_SSE2; } } #endif #if defined(HAS_RGB565TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { RGB565ToARGBRow = RGB565ToARGBRow_AVX2; } } #endif #if defined(HAS_RGB565TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB565ToARGBRow = RGB565ToARGBRow_NEON; } } #endif #if defined(HAS_RGB565TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { RGB565ToARGBRow = RGB565ToARGBRow_MMI; } } #endif #if defined(HAS_RGB565TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 16)) { RGB565ToARGBRow = RGB565ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); src_rgb565 += src_stride_rgb565; dst_argb += dst_stride_argb; } return 0; } // Convert ARGB1555 to ARGB. LIBYUV_API int ARGB1555ToARGB(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) = ARGB1555ToARGBRow_C; if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } // Coalesce rows. 
if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb1555 = dst_stride_argb = 0; } #if defined(HAS_ARGB1555TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } #endif #if defined(HAS_ARGB1555TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; } } #endif #if defined(HAS_ARGB1555TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; } } #endif #if defined(HAS_ARGB1555TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_MMI; } } #endif #if defined(HAS_ARGB1555TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); src_argb1555 += src_stride_argb1555; dst_argb += dst_stride_argb; } return 0; } // Convert ARGB4444 to ARGB. LIBYUV_API int ARGB4444ToARGB(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) = ARGB4444ToARGBRow_C; if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } // Coalesce rows. 
if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb4444 = dst_stride_argb = 0; } #if defined(HAS_ARGB4444TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } #endif #if defined(HAS_ARGB4444TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; } } #endif #if defined(HAS_ARGB4444TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; } } #endif #if defined(HAS_ARGB4444TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_MMI; } } #endif #if defined(HAS_ARGB4444TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); src_argb4444 += src_stride_argb4444; dst_argb += dst_stride_argb; } return 0; } // Convert AR30 to ARGB. LIBYUV_API int AR30ToARGB(const uint8_t* src_ar30, int src_stride_ar30, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; src_stride_ar30 = -src_stride_ar30; } // Coalesce rows. 
if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_ar30 = dst_stride_argb = 0; } for (y = 0; y < height; ++y) { AR30ToARGBRow_C(src_ar30, dst_argb, width); src_ar30 += src_stride_ar30; dst_argb += dst_stride_argb; } return 0; } // Convert AR30 to ABGR. LIBYUV_API int AR30ToABGR(const uint8_t* src_ar30, int src_stride_ar30, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { int y; if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; src_stride_ar30 = -src_stride_ar30; } // Coalesce rows. if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { width *= height; height = 1; src_stride_ar30 = dst_stride_abgr = 0; } for (y = 0; y < height; ++y) { AR30ToABGRRow_C(src_ar30, dst_abgr, width); src_ar30 += src_stride_ar30; dst_abgr += dst_stride_abgr; } return 0; } // Convert AR30 to AB30. LIBYUV_API int AR30ToAB30(const uint8_t* src_ar30, int src_stride_ar30, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { int y; if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; src_stride_ar30 = -src_stride_ar30; } // Coalesce rows. if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { width *= height; height = 1; src_stride_ar30 = dst_stride_ab30 = 0; } for (y = 0; y < height; ++y) { AR30ToAB30Row_C(src_ar30, dst_ab30, width); src_ar30 += src_stride_ar30; dst_ab30 += dst_stride_ab30; } return 0; } // Convert AR64 to ARGB. 
LIBYUV_API int AR64ToARGB(const uint16_t* src_ar64, int src_stride_ar64, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, int width) = AR64ToARGBRow_C; if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; src_stride_ar64 = -src_stride_ar64; } // Coalesce rows. if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_ar64 = dst_stride_argb = 0; } #if defined(HAS_AR64TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { AR64ToARGBRow = AR64ToARGBRow_SSSE3; } } #endif #if defined(HAS_AR64TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { AR64ToARGBRow = AR64ToARGBRow_AVX2; } } #endif #if defined(HAS_AR64TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { AR64ToARGBRow = AR64ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { AR64ToARGBRow = AR64ToARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { AR64ToARGBRow(src_ar64, dst_argb, width); src_ar64 += src_stride_ar64; dst_argb += dst_stride_argb; } return 0; } // Convert AB64 to ARGB. LIBYUV_API int AB64ToARGB(const uint16_t* src_ab64, int src_stride_ab64, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, int width) = AB64ToARGBRow_C; if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; src_stride_ab64 = -src_stride_ab64; } // Coalesce rows. 
if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_ab64 = dst_stride_argb = 0; } #if defined(HAS_AB64TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { AB64ToARGBRow = AB64ToARGBRow_SSSE3; } } #endif #if defined(HAS_AB64TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { AB64ToARGBRow = AB64ToARGBRow_AVX2; } } #endif #if defined(HAS_AB64TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { AB64ToARGBRow = AB64ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { AB64ToARGBRow = AB64ToARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { AB64ToARGBRow(src_ab64, dst_argb, width); src_ab64 += src_stride_ab64; dst_argb += dst_stride_argb; } return 0; } // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*NV12ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_NV12TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_SSSE3; } } #endif #if defined(HAS_NV12TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV12ToARGBRow = NV12ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { NV12ToARGBRow = NV12ToARGBRow_AVX2; } } #endif #if defined(HAS_NV12TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV12ToARGBRow = NV12ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_NEON; } } #endif #if defined(HAS_NV12TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV12ToARGBRow = NV12ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { NV12ToARGBRow = NV12ToARGBRow_MMI; } } #endif #if defined(HAS_NV12TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { NV12ToARGBRow = NV12ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { NV12ToARGBRow = NV12ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } } return 0; } // Convert NV21 to ARGB with matrix. LIBYUV_API int NV21ToARGBMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_argb, int dst_stride_argb, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*NV21ToARGBRow)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } #if defined(HAS_NV21TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { NV21ToARGBRow = NV21ToARGBRow_SSSE3; } } #endif #if defined(HAS_NV21TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV21ToARGBRow = NV21ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { NV21ToARGBRow = NV21ToARGBRow_AVX2; } } #endif #if defined(HAS_NV21TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV21ToARGBRow = NV21ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { NV21ToARGBRow = NV21ToARGBRow_NEON; } } #endif #if defined(HAS_NV21TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV21ToARGBRow = NV21ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { NV21ToARGBRow = NV21ToARGBRow_MMI; } } #endif #if defined(HAS_NV21TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { NV21ToARGBRow = NV21ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { NV21ToARGBRow = NV21ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { src_vu += src_stride_vu; } } return 0; } // Convert NV12 to ARGB. LIBYUV_API int NV12ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert NV21 to ARGB. LIBYUV_API int NV21ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert NV12 to ABGR. 
// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. // To swap the UV use NV12 instead of NV21.LIBYUV_API LIBYUV_API int NV12ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, dst_stride_abgr, &kYvuI601Constants, width, height); } // Convert NV21 to ABGR. LIBYUV_API int NV21ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, dst_stride_abgr, &kYvuI601Constants, width, height); } // TODO(fbarchard): Consider SSSE3 2 step conversion. // Convert NV12 to RGB24 with matrix. LIBYUV_API int NV12ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*NV12ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } #if defined(HAS_NV12TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { NV12ToRGB24Row = NV12ToRGB24Row_NEON; } } #endif #if defined(HAS_NV12TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; } } #endif #if defined(HAS_NV12TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { NV12ToRGB24Row = NV12ToRGB24Row_AVX2; } } #endif #if defined(HAS_NV12TORGB24ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV12ToRGB24Row = NV12ToRGB24Row_Any_MMI; if (IS_ALIGNED(width, 8)) { NV12ToRGB24Row = NV12ToRGB24Row_MMI; } } #endif for (y = 0; y < height; ++y) { NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); dst_rgb24 += dst_stride_rgb24; src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } } return 0; } // Convert NV21 to RGB24 with matrix. LIBYUV_API int NV21ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*NV21ToRGB24Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } #if defined(HAS_NV21TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { NV21ToRGB24Row = NV21ToRGB24Row_NEON; } } #endif #if defined(HAS_NV21TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; } } #endif #if defined(HAS_NV21TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { NV21ToRGB24Row = NV21ToRGB24Row_AVX2; } } #endif #if defined(HAS_NV21TORGB24ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV21ToRGB24Row = NV21ToRGB24Row_Any_MMI; if (IS_ALIGNED(width, 8)) { NV21ToRGB24Row = NV21ToRGB24Row_MMI; } } #endif for (y = 0; y < height; ++y) { NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); dst_rgb24 += dst_stride_rgb24; src_y += src_stride_y; if (y & 1) { src_vu += src_stride_vu; } } return 0; } // Convert NV12 to RGB24. LIBYUV_API int NV12ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, width, height); } // Convert NV21 to RGB24. LIBYUV_API int NV21ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, width, height); } // Convert NV12 to RAW. 
LIBYUV_API int NV12ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_raw, int dst_stride_raw, int width, int height) { return NV21ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_raw, dst_stride_raw, &kYvuI601Constants, width, height); } // Convert NV21 to RAW. LIBYUV_API int NV21ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_raw, int dst_stride_raw, int width, int height) { return NV12ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_raw, dst_stride_raw, &kYvuI601Constants, width, height); } // Convert NV21 to YUV24 int NV21ToYUV24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_yuv24, int dst_stride_yuv24, int width, int height) { int y; void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24; dst_stride_yuv24 = -dst_stride_yuv24; } #if defined(HAS_NV21TOYUV24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON; if (IS_ALIGNED(width, 16)) { NV21ToYUV24Row = NV21ToYUV24Row_NEON; } } #endif #if defined(HAS_NV21TOYUV24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { NV21ToYUV24Row = NV21ToYUV24Row_SSSE3; } } #endif #if defined(HAS_NV21TOYUV24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { NV21ToYUV24Row = NV21ToYUV24Row_AVX2; } } #endif for (y = 0; y < height; ++y) { NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width); dst_yuv24 += dst_stride_yuv24; src_y += src_stride_y; if (y & 1) { src_vu += src_stride_vu; } } return 0; } // Convert YUY2 to ARGB. 
// Every row is converted with kYuvI601Constants.
// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
// height == 0.  A negative height flips the image vertically by reading the
// source bottom-up.
LIBYUV_API
int YUY2ToARGB(const uint8_t* src_yuy2,
               int src_stride_yuy2,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height) {
  int row;
  // Start from the portable C kernel; CPU-specific kernels override it below.
  void (*ConvertRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
                     const struct YuvConstants* yuvconstants, int width) =
      YUY2ToARGBRow_C;
  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_yuy2 += (height - 1) * src_stride_yuy2;
    src_stride_yuy2 = -src_stride_yuy2;
  }
  // Coalesce rows: contiguous buffers are processed as one long row.
  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
    width *= height;
    height = 1;
    src_stride_yuy2 = 0;
    dst_stride_argb = 0;
  }
#if defined(HAS_YUY2TOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ConvertRow =
        IS_ALIGNED(width, 16) ? YUY2ToARGBRow_SSSE3 : YUY2ToARGBRow_Any_SSSE3;
  }
#endif
#if defined(HAS_YUY2TOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ConvertRow =
        IS_ALIGNED(width, 32) ? YUY2ToARGBRow_AVX2 : YUY2ToARGBRow_Any_AVX2;
  }
#endif
#if defined(HAS_YUY2TOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ConvertRow =
        IS_ALIGNED(width, 8) ? YUY2ToARGBRow_NEON : YUY2ToARGBRow_Any_NEON;
  }
#endif
#if defined(HAS_YUY2TOARGBROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ConvertRow =
        IS_ALIGNED(width, 4) ? YUY2ToARGBRow_MMI : YUY2ToARGBRow_Any_MMI;
  }
#endif
#if defined(HAS_YUY2TOARGBROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ConvertRow =
        IS_ALIGNED(width, 8) ? YUY2ToARGBRow_MSA : YUY2ToARGBRow_Any_MSA;
  }
#endif
  for (row = 0; row < height; ++row) {
    ConvertRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
    dst_argb += dst_stride_argb;
    src_yuy2 += src_stride_yuy2;
  }
  return 0;
}

// Convert UYVY to ARGB.
// Every row is converted with kYuvI601Constants.
// Returns 0 on success, or -1 when a pointer is NULL, width <= 0 or
// height == 0.  A negative height flips the image vertically by reading the
// source bottom-up.
LIBYUV_API
int UYVYToARGB(const uint8_t* src_uyvy,
               int src_stride_uyvy,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int width,
               int height) {
  int y;
  // Start from the portable C kernel; CPU-specific kernels override it below.
  void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants, int width) =
      UYVYToARGBRow_C;
  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
    src_stride_uyvy = -src_stride_uyvy;
  }
  // Coalesce rows: contiguous buffers are processed as one long row.
  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
    width *= height;
    height = 1;
    src_stride_uyvy = dst_stride_argb = 0;
  }
#if defined(HAS_UYVYTOARGBROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      UYVYToARGBRow = UYVYToARGBRow_SSSE3;
    }
  }
#endif
#if defined(HAS_UYVYTOARGBROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    UYVYToARGBRow = UYVYToARGBRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      UYVYToARGBRow = UYVYToARGBRow_AVX2;
    }
  }
#endif
#if defined(HAS_UYVYTOARGBROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      UYVYToARGBRow = UYVYToARGBRow_NEON;
    }
  }
#endif
#if defined(HAS_UYVYTOARGBROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    UYVYToARGBRow = UYVYToARGBRow_Any_MMI;
    if (IS_ALIGNED(width, 4)) {
      UYVYToARGBRow = UYVYToARGBRow_MMI;
    }
  }
#endif
#if defined(HAS_UYVYTOARGBROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
    if (IS_ALIGNED(width, 8)) {
      UYVYToARGBRow = UYVYToARGBRow_MSA;
    }
  }
#endif
  for (y = 0; y < height; ++y) {
    UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
    src_uyvy += src_stride_uyvy;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

// Interleaves `width` U and V samples into dst_uv as U,V byte pairs.
// src_pixel_stride_uv is the byte step between consecutive samples in both
// source planes.
static void WeavePixels(const uint8_t* src_u,
                        const uint8_t* src_v,
                        int src_pixel_stride_uv,
                        uint8_t* dst_uv,
                        int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[0] = *src_u;
    dst_uv[1] = *src_v;
    dst_uv += 2;
    src_u += src_pixel_stride_uv;
    src_v += src_pixel_stride_uv;
  }
}

// Convert Android420 to ARGB with matrix.
// Picks a converter from the chroma layout:
//   - pixel stride 1                                -> I420ToARGBMatrix
//   - pixel stride 2, V one byte before U, equal strides -> NV21ToARGBMatrix
//   - pixel stride 2, V one byte after U, equal strides  -> NV12ToARGBMatrix
//   - anything else: weave U and V into a temporary NV12 plane first.
// Returns 0 on success, -1 when a pointer is NULL, width <= 0 or
// height == 0, and 1 when the temporary UV plane cannot be allocated.
// A negative height flips the image vertically by writing the destination
// bottom-up.
LIBYUV_API
int Android420ToARGBMatrix(const uint8_t* src_y,
                           int src_stride_y,
                           const uint8_t* src_u,
                           int src_stride_u,
                           const uint8_t* src_v,
                           int src_stride_v,
                           int src_pixel_stride_uv,
                           uint8_t* dst_argb,
                           int dst_stride_argb,
                           const struct YuvConstants* yuvconstants,
                           int width,
                           int height) {
  int y;
  int ret;
  uint8_t* dst_uv;
  const ptrdiff_t vu_off = src_v - src_u;
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    // Recompute: the initial value was derived from the negative height.
    halfheight = (height + 1) >> 1;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }

  // I420
  if (src_pixel_stride_uv == 1) {
    return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                            src_stride_v, dst_argb, dst_stride_argb,
                            yuvconstants, width, height);
  }
  // NV21
  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
      src_stride_u == src_stride_v) {
    return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
                            dst_stride_argb, yuvconstants, width, height);
  }
  // NV12
  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
    return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
                            dst_stride_argb, yuvconstants, width, height);
  }

  // General case fallback creates NV12
  align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
  // NOTE(review): assumes align_buffer_64 is malloc-backed and leaves
  // plane_uv NULL on failure - confirm against its definition.  Without this
  // guard WeavePixels would write through a NULL pointer.
  if (!plane_uv) {
    return 1;
  }
  dst_uv = plane_uv;
  for (y = 0; y < halfheight; ++y) {
    WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
    src_u += src_stride_u;
    src_v += src_stride_v;
    dst_uv += halfwidth * 2;
  }
  // Propagate the converter's status instead of unconditionally reporting
  // success.
  ret = NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2,
                         dst_argb, dst_stride_argb, yuvconstants, width,
                         height);
  free_aligned_buffer_64(plane_uv);
  return ret;
}

// Convert Android420 to ARGB.
LIBYUV_API int Android420ToARGB(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_pixel_stride_uv, dst_argb, dst_stride_argb, &kYuvI601Constants, width, height); } // Convert Android420 to ABGR. LIBYUV_API int Android420ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_abgr, int dst_stride_abgr, int width, int height) { return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, src_pixel_stride_uv, dst_abgr, dst_stride_abgr, &kYvuI601Constants, width, height); } // Convert I422 to RGBA with matrix. LIBYUV_API int I422ToRGBAMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } #if defined(HAS_I422TORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif #if defined(HAS_I422TORGBAROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGBARow = I422ToRGBARow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToRGBARow = I422ToRGBARow_AVX2; } } #endif #if defined(HAS_I422TORGBAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGBARow = I422ToRGBARow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_NEON; } } #endif #if defined(HAS_I422TORGBAROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToRGBARow = I422ToRGBARow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToRGBARow = I422ToRGBARow_MMI; } } #endif #if defined(HAS_I422TORGBAROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGBARow = I422ToRGBARow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); dst_rgba += dst_stride_rgba; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Convert I422 to RGBA. LIBYUV_API int I422ToRGBA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_rgba, dst_stride_rgba, &kYuvI601Constants, width, height); } // Convert I422 to BGRA. 
LIBYUV_API int I422ToBGRA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_bgra, int dst_stride_bgra, int width, int height) { return I422ToRGBAMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert NV12 to RGB565 with matrix. LIBYUV_API int NV12ToRGB565Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb565, int dst_stride_rgb565, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*NV12ToRGB565Row)( const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } #if defined(HAS_NV12TORGB565ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; } } #endif #if defined(HAS_NV12TORGB565ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { NV12ToRGB565Row = NV12ToRGB565Row_AVX2; } } #endif #if defined(HAS_NV12TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { NV12ToRGB565Row = NV12ToRGB565Row_NEON; } } #endif #if defined(HAS_NV12TORGB565ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_MMI; if (IS_ALIGNED(width, 4)) { NV12ToRGB565Row = NV12ToRGB565Row_MMI; } } #endif #if defined(HAS_NV12TORGB565ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; if (IS_ALIGNED(width, 
8)) { NV12ToRGB565Row = NV12ToRGB565Row_MSA; } } #endif for (y = 0; y < height; ++y) { NV12ToRGB565Row(src_y, src_uv, dst_rgb565, yuvconstants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } } return 0; } // Convert NV12 to RGB565. LIBYUV_API int NV12ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { return NV12ToRGB565Matrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_rgb565, dst_stride_rgb565, &kYuvI601Constants, width, height); } // Convert I422 to RGBA with matrix. LIBYUV_API int I420ToRGBAMatrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGBARow_C; if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba; dst_stride_rgba = -dst_stride_rgba; } #if defined(HAS_I422TORGBAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGBARow = I422ToRGBARow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_SSSE3; } } #endif #if defined(HAS_I422TORGBAROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGBARow = I422ToRGBARow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToRGBARow = I422ToRGBARow_AVX2; } } #endif #if defined(HAS_I422TORGBAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGBARow = I422ToRGBARow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_NEON; } } #endif #if defined(HAS_I422TORGBAROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToRGBARow = I422ToRGBARow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToRGBARow = I422ToRGBARow_MMI; } } #endif #if defined(HAS_I422TORGBAROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGBARow = I422ToRGBARow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToRGBARow = I422ToRGBARow_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToRGBARow(src_y, src_u, src_v, dst_rgba, yuvconstants, width); dst_rgba += dst_stride_rgba; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to RGBA. LIBYUV_API int I420ToRGBA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_rgba, dst_stride_rgba, &kYuvI601Constants, width, height); } // Convert I420 to BGRA. 
LIBYUV_API int I420ToBGRA(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_bgra, int dst_stride_bgra, int width, int height) { return I420ToRGBAMatrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert I420 to RGB24 with matrix. LIBYUV_API int I420ToRGB24Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb24, int dst_stride_rgb24, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB24Row_C; if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; dst_stride_rgb24 = -dst_stride_rgb24; } #if defined(HAS_I422TORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif #if defined(HAS_I422TORGB24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToRGB24Row = I422ToRGB24Row_AVX2; } } #endif #if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB24Row = I422ToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGB24Row = I422ToRGB24Row_NEON; } } #endif #if defined(HAS_I422TORGB24ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToRGB24Row = I422ToRGB24Row_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToRGB24Row = I422ToRGB24Row_MMI; } } #endif #if defined(HAS_I422TORGB24ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGB24Row = I422ToRGB24Row_Any_MSA; if (IS_ALIGNED(width, 16)) { I422ToRGB24Row = I422ToRGB24Row_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); dst_rgb24 += dst_stride_rgb24; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to RGB24. LIBYUV_API int I420ToRGB24(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, width, height); } // Convert I420 to RAW. 
// Implemented with the RGB24 converter by swapping the U and V planes and
// using the kYvuI601Constants matrix.
LIBYUV_API
int I420ToRAW(const uint8_t* src_y, int src_stride_y,
              const uint8_t* src_u, int src_stride_u,
              const uint8_t* src_v, int src_stride_v,
              uint8_t* dst_raw, int dst_stride_raw,
              int width, int height) {
  return I420ToRGB24Matrix(src_y, src_stride_y,
                           src_v, src_stride_v,  // V passed as U
                           src_u, src_stride_u,  // U passed as V
                           dst_raw, dst_stride_raw,
                           &kYvuI601Constants,  // Yvu matrix
                           width, height);
}

// Convert J420 to RGB24.
// Forwards to I420ToRGB24Matrix with the kYuvJPEGConstants matrix.
LIBYUV_API
int J420ToRGB24(const uint8_t* src_y, int src_stride_y,
                const uint8_t* src_u, int src_stride_u,
                const uint8_t* src_v, int src_stride_v,
                uint8_t* dst_rgb24, int dst_stride_rgb24,
                int width, int height) {
  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                           src_stride_v, dst_rgb24, dst_stride_rgb24,
                           &kYuvJPEGConstants, width, height);
}

// Convert J420 to RAW.
// Same as J420ToRGB24 but with U/V swapped and the kYvuJPEGConstants matrix.
LIBYUV_API
int J420ToRAW(const uint8_t* src_y, int src_stride_y,
              const uint8_t* src_u, int src_stride_u,
              const uint8_t* src_v, int src_stride_v,
              uint8_t* dst_raw, int dst_stride_raw,
              int width, int height) {
  return I420ToRGB24Matrix(src_y, src_stride_y,
                           src_v, src_stride_v,  // V passed as U
                           src_u, src_stride_u,  // U passed as V
                           dst_raw, dst_stride_raw,
                           &kYvuJPEGConstants,  // Yvu matrix
                           width, height);
}

// Convert H420 to RGB24.
// Forwards to I420ToRGB24Matrix with the kYuvH709Constants matrix.
LIBYUV_API
int H420ToRGB24(const uint8_t* src_y, int src_stride_y,
                const uint8_t* src_u, int src_stride_u,
                const uint8_t* src_v, int src_stride_v,
                uint8_t* dst_rgb24, int dst_stride_rgb24,
                int width, int height) {
  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
                           src_stride_v, dst_rgb24, dst_stride_rgb24,
                           &kYuvH709Constants, width, height);
}

// Convert H420 to RAW.
LIBYUV_API int H420ToRAW(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_raw, int dst_stride_raw, int width, int height) { return I420ToRGB24Matrix(src_y, src_stride_y, src_v, src_stride_v, // Swap U and V src_u, src_stride_u, dst_raw, dst_stride_raw, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I420 to ARGB1555. LIBYUV_API int I420ToARGB1555(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb1555, int dst_stride_argb1555, int width, int height) { int y; void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555; dst_stride_argb1555 = -dst_stride_argb1555; } #if defined(HAS_I422TOARGB1555ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGB1555Row = I422ToARGB1555Row_SSSE3; } } #endif #if defined(HAS_I422TOARGB1555ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGB1555Row = I422ToARGB1555Row_AVX2; } } #endif #if defined(HAS_I422TOARGB1555ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGB1555Row = I422ToARGB1555Row_NEON; } } #endif #if defined(HAS_I422TOARGB1555ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToARGB1555Row = I422ToARGB1555Row_MMI; } } #endif #if defined(HAS_I422TOARGB1555ROW_MSA) if 
(TestCpuFlag(kCpuHasMSA)) { I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToARGB1555Row = I422ToARGB1555Row_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, width); dst_argb1555 += dst_stride_argb1555; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to ARGB4444. LIBYUV_API int I420ToARGB4444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_argb4444, int dst_stride_argb4444, int width, int height) { int y; void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444; dst_stride_argb4444 = -dst_stride_argb4444; } #if defined(HAS_I422TOARGB4444ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGB4444Row = I422ToARGB4444Row_SSSE3; } } #endif #if defined(HAS_I422TOARGB4444ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGB4444Row = I422ToARGB4444Row_AVX2; } } #endif #if defined(HAS_I422TOARGB4444ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGB4444Row = I422ToARGB4444Row_NEON; } } #endif #if defined(HAS_I422TOARGB4444ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToARGB4444Row = I422ToARGB4444Row_MMI; } } #endif #if defined(HAS_I422TOARGB4444ROW_MSA) if 
(TestCpuFlag(kCpuHasMSA)) { I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToARGB4444Row = I422ToARGB4444Row_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, width); dst_argb4444 += dst_stride_argb4444; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to RGB565 with specified color matrix. LIBYUV_API int I420ToRGB565Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } #if defined(HAS_I422TORGB565ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } #endif #if defined(HAS_I422TORGB565ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToRGB565Row = I422ToRGB565Row_AVX2; } } #endif #if defined(HAS_I422TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB565Row = I422ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_NEON; } } #endif #if defined(HAS_I422TORGB565ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToRGB565Row = I422ToRGB565Row_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToRGB565Row = I422ToRGB565Row_MMI; } } #endif #if defined(HAS_I422TORGB565ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGB565Row = I422ToRGB565Row_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, yuvconstants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to RGB565. LIBYUV_API int I420ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_rgb565, dst_stride_rgb565, &kYuvI601Constants, width, height); } // Convert J420 to RGB565. 
LIBYUV_API int J420ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_rgb565, dst_stride_rgb565, &kYuvJPEGConstants, width, height); } // Convert H420 to RGB565. LIBYUV_API int H420ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { return I420ToRGB565Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_rgb565, dst_stride_rgb565, &kYuvH709Constants, width, height); } // Convert I422 to RGB565. LIBYUV_API int I422ToRGB565(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, int width, int height) { int y; void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToRGB565Row_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } #if defined(HAS_I422TORGB565ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_SSSE3; } } #endif #if defined(HAS_I422TORGB565ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToRGB565Row = I422ToRGB565Row_AVX2; } } #endif #if defined(HAS_I422TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB565Row = I422ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_NEON; } } #endif #if defined(HAS_I422TORGB565ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToRGB565Row = I422ToRGB565Row_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToRGB565Row = I422ToRGB565Row_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; } return 0; } // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. static const uint8_t kDither565_4x4[16] = { 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert I420 to RGB565 with dithering. 
LIBYUV_API int I420ToRGB565Dither(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_rgb565, int dst_stride_rgb565, const uint8_t* dither4x4, int width, int height) { int y; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGBRow_C; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } if (!dither4x4) { dither4x4 = kDither565_4x4; } #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; } } #endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; } } #endif #if defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { I422ToARGBRow = I422ToARGBRow_MMI; } } #endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { I422ToARGBRow = I422ToARGBRow_MSA; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToRGB565DitherRow = 
ARGBToRGB565DitherRow_SSE2; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; if (IS_ALIGNED(width, 4)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; } } #endif { // Allocate a row of argb. align_buffer_64(row_argb, width * 4); for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } free_aligned_buffer_64(row_argb); } return 0; } // Convert I420 to AR30 with matrix. LIBYUV_API int I420ToAR30Matrix(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, const struct YuvConstants* yuvconstants, int width, int height) { int y; void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } #if defined(HAS_I422TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToAR30Row = I422ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { I422ToAR30Row = I422ToAR30Row_SSSE3; } } #endif #if defined(HAS_I422TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToAR30Row = I422ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { I422ToAR30Row = I422ToAR30Row_AVX2; } } #endif for (y = 0; y < height; ++y) { I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); dst_ar30 += dst_stride_ar30; src_y += src_stride_y; if (y & 1) { src_u += src_stride_u; src_v += src_stride_v; } } return 0; } // Convert I420 to AR30. LIBYUV_API int I420ToAR30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuvI601Constants, width, height); } // Convert H420 to AR30. LIBYUV_API int H420ToAR30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height) { return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYvuH709Constants, width, height); } // Convert I420 to AB30. LIBYUV_API int I420ToAB30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYvuI601Constants, width, height); } // Convert H420 to AB30. 
LIBYUV_API int H420ToAB30(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_ab30, int dst_stride_ab30, int width, int height) { return I420ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_ab30, dst_stride_ab30, &kYvuH709Constants, width, height); } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/convert_from.cc000066400000000000000000000666061416500237200220560ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/convert_from.h" #include "libyuv/basic_types.h" #include "libyuv/convert.h" // For I420Copy #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() #include "libyuv/video_common.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) static __inline int Abs(int v) { return v >= 0 ? v : -v; } // I420 To any I4xx YUV format with mirroring. 
// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane static int I420ToI4xx(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int src_y_width, int src_y_height, int dst_uv_width, int dst_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || dst_uv_height <= 0) { return -1; } if (dst_y) { ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); } ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } // Convert 8 bit YUV to 10 bit. LIBYUV_API int I420ToI010(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } // Convert Y plane. 
Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, height); // Convert UV planes. Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, halfheight); Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, halfheight); return 0; } // Convert 8 bit YUV to 12 bit. LIBYUV_API int I420ToI012(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } // Convert Y plane. Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 4096, width, height); // Convert UV planes. 
Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 4096, halfwidth, halfheight); Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 4096, halfwidth, halfheight); return 0; } // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API int I420ToI422(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { const int dst_uv_width = (Abs(width) + 1) >> 1; const int dst_uv_height = Abs(height); return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, dst_uv_width, dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height // 444 chroma is 1x width, 1x height LIBYUV_API int I420ToI444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { const int dst_uv_width = Abs(width); const int dst_uv_height = Abs(height); return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height, dst_uv_width, dst_uv_height); } // 420 chroma to 444 chroma, 10/12 bit version LIBYUV_API int I010ToI410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { if (width == 0 || height == 0) { return -1; } if (dst_y) { ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), 
SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width), Abs(height), kFilterBilinear); ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width), Abs(height), kFilterBilinear); return 0; } // 422 chroma to 444 chroma, 10/12 bit version LIBYUV_API int I210ToI410(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height) { if (width == 0 || height == 0) { return -1; } if (dst_y) { ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, dst_stride_u, Abs(width), Abs(height), kFilterBilinear); ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, dst_stride_v, Abs(width), Abs(height), kFilterBilinear); return 0; } // 422 chroma is 1/2 width, 1x height // 444 chroma is 1x width, 1x height LIBYUV_API int I422ToI444(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { if (width == 0 || height == 0) { return -1; } if (dst_y) { ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, Abs(width), Abs(height), kFilterBilinear); } ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u, dst_stride_u, Abs(width), Abs(height), kFilterBilinear); ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v, dst_stride_v, Abs(width), Abs(height), kFilterBilinear); return 0; } // Copy to I400. 
Source can be I420,422,444,400,NV12,NV21 LIBYUV_API int I400Copy(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; } LIBYUV_API int I422ToYUY2(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. 
if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; } #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif #if defined(HAS_I422TOYUY2ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToYUY2Row = I422ToYUY2Row_AVX2; } } #endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_NEON; } } #endif for (y = 0; y < height; ++y) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; dst_yuy2 += dst_stride_yuy2; } return 0; } LIBYUV_API int I420ToYUY2(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif #if defined(HAS_I422TOYUY2ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToYUY2Row = I422ToYUY2Row_AVX2; } } #endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_NEON; } } #endif #if defined(HAS_I422TOYUY2ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToYUY2Row = I422ToYUY2Row_Any_MMI; if (IS_ALIGNED(width, 8)) { I422ToYUY2Row = I422ToYUY2Row_MMI; } } #endif #if defined(HAS_I422TOYUY2ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToYUY2Row = I422ToYUY2Row_Any_MSA; if (IS_ALIGNED(width, 32)) { I422ToYUY2Row = I422ToYUY2Row_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); I422ToYUY2Row(src_y + src_stride_y, src_u, src_v, dst_yuy2 + dst_stride_yuy2, width); src_y += src_stride_y * 2; src_u += src_stride_u; src_v += src_stride_v; dst_yuy2 += dst_stride_yuy2 * 2; } if (height & 1) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); } return 0; } LIBYUV_API int I422ToUYVY(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. if (src_stride_y == width && src_stride_u * 2 == width && src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; } #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif #if defined(HAS_I422TOUYVYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToUYVYRow = I422ToUYVYRow_AVX2; } } #endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_NEON; } } #endif #if defined(HAS_I422TOUYVYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToUYVYRow = I422ToUYVYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { I422ToUYVYRow = I422ToUYVYRow_MMI; } } #endif #if defined(HAS_I422TOUYVYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToUYVYRow = I422ToUYVYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { I422ToUYVYRow = I422ToUYVYRow_MSA; } } #endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); src_y += src_stride_y; src_u += src_stride_u; src_v += src_stride_v; dst_uyvy += dst_stride_uyvy; } return 0; } LIBYUV_API int I420ToUYVY(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif #if defined(HAS_I422TOUYVYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToUYVYRow = I422ToUYVYRow_AVX2; } } #endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_NEON; } } #endif #if defined(HAS_I422TOUYVYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToUYVYRow = I422ToUYVYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { I422ToUYVYRow = I422ToUYVYRow_MMI; } } #endif #if defined(HAS_I422TOUYVYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToUYVYRow = I422ToUYVYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { I422ToUYVYRow = I422ToUYVYRow_MSA; } } #endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); I422ToUYVYRow(src_y + src_stride_y, src_u, src_v, dst_uyvy + dst_stride_uyvy, width); src_y += src_stride_y * 2; src_u += src_stride_u; src_v += src_stride_v; dst_uyvy += dst_stride_uyvy * 2; } if (height & 1) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); } return 0; } LIBYUV_API int I420ToNV12(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int halfwidth = (width + 1) / 2; int halfheight = (height + 1) / 2; if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; } LIBYUV_API int I420ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // Convert I420 to specified format LIBYUV_API int ConvertFromI420(const uint8_t* y, int y_stride, const uint8_t* u, int u_stride, const uint8_t* v, int v_stride, uint8_t* dst_sample, int dst_sample_stride, int width, int height, uint32_t fourcc) { uint32_t format = CanonicalFourCC(fourcc); int r = 0; if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { return -1; } switch (format) { // Single plane formats case FOURCC_YUY2: r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_UYVY: r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_RGBP: r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_RGBO: r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? 
dst_sample_stride : width * 2, width, height); break; case FOURCC_R444: r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_24BG: r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 3, width, height); break; case FOURCC_RAW: r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 3, width, height); break; case FOURCC_ARGB: r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; case FOURCC_BGRA: r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; case FOURCC_ABGR: r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; case FOURCC_RGBA: r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; case FOURCC_AR30: r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 4, width, height); break; case FOURCC_I400: r = I400Copy(y, y_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, width, height); break; case FOURCC_NV12: { int dst_y_stride = dst_sample_stride ? dst_sample_stride : width; uint8_t* dst_uv = dst_sample + dst_y_stride * height; r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, dst_uv, dst_sample_stride ? dst_sample_stride : width, width, height); break; } case FOURCC_NV21: { int dst_y_stride = dst_sample_stride ? 
dst_sample_stride : width; uint8_t* dst_vu = dst_sample + dst_y_stride * height; r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width, dst_vu, dst_sample_stride ? dst_sample_stride : width, width, height); break; } // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; int halfstride = (dst_sample_stride + 1) / 2; int halfheight = (height + 1) / 2; uint8_t* dst_u; uint8_t* dst_v; if (format == FOURCC_YV12) { dst_v = dst_sample + dst_sample_stride * height; dst_u = dst_v + halfstride * halfheight; } else { dst_u = dst_sample + dst_sample_stride * height; dst_v = dst_u + halfstride * halfheight; } r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I422: case FOURCC_YV16: { dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; int halfstride = (dst_sample_stride + 1) / 2; uint8_t* dst_u; uint8_t* dst_v; if (format == FOURCC_YV16) { dst_v = dst_sample + dst_sample_stride * height; dst_u = dst_v + halfstride * height; } else { dst_u = dst_sample + dst_sample_stride * height; dst_v = dst_u + halfstride * height; } r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I444: case FOURCC_YV24: { dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; uint8_t* dst_u; uint8_t* dst_v; if (format == FOURCC_YV24) { dst_v = dst_sample + dst_sample_stride * height; dst_u = dst_v + dst_sample_stride * height; } else { dst_u = dst_sample + dst_sample_stride * height; dst_v = dst_u + dst_sample_stride * height; } r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride, dst_u, dst_sample_stride, dst_v, dst_sample_stride, width, height); break; } // Formats not supported - MJPG, biplanar, some rgb formats. default: return -1; // unknown fourcc - return failure code. } return r; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/convert_from_argb.cc000066400000000000000000002000111416500237200230250ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/

#include "libyuv/convert_from_argb.h"

#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// ARGB little endian (bgra in memory) to I444
// Every source row produces one row of Y, U and V, all |width| samples wide.
// Returns -1 on a NULL pointer, width <= 0 or height == 0, otherwise 0.
LIBYUV_API
int ARGBToI444(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  int y;
  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
      ARGBToYRow_C;
  void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u,
                         uint8_t* dst_v, int width) = ARGBToUV444Row_C;
  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height: read the source bottom-up.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows.
  // When every stride equals its row size, treat the image as one long row.
  if (src_stride_argb == width * 4 && dst_stride_y == width &&
      dst_stride_u == width && dst_stride_v == width) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
  }
  // Row function selection: a later matching block overrides an earlier one,
  // so the order of the blocks below matters.
#if defined(HAS_ARGBTOUV444ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUV444ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToUV444Row = ARGBToUV444Row_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOUV444ROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToUV444Row = ARGBToUV444Row_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToUV444Row = ARGBToUV444Row_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOUV444ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUV444Row = ARGBToUV444Row_MSA;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYRow = ARGBToYRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYRow = ARGBToYRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYRow = ARGBToYRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_MSA;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    ARGBToUV444Row(src_argb, dst_u, dst_v, width);
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  return 0;
}

// ARGB little endian (bgra in memory) to I422
// Every source row produces one row of Y, U and V. The UV row function is
// called with a row stride of 0, i.e. on a single source row at a time.
// Returns -1 on a NULL pointer, width <= 0 or height == 0, otherwise 0.
LIBYUV_API
int ARGBToI422(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  int y;
  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
                      uint8_t* dst_u, uint8_t* dst_v, int width) =
      ARGBToUVRow_C;
  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
      ARGBToYRow_C;
  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows.
  // When every stride equals its row size, treat the image as one long row.
  if (src_stride_argb == width * 4 && dst_stride_y == width &&
      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
  }
  // Row function selection: a later matching block overrides an earlier one,
  // so the order of the blocks below matters.
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYRow = ARGBToYRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVRow = ARGBToUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToUVRow = ARGBToUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_NEON;
    }
  }
#endif

#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYRow = ARGBToYRow_Any_MMI;
    ARGBToUVRow = ARGBToUVRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_MMI;
    }
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_MMI;
    }
  }
#endif

#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYRow = ARGBToYRow_Any_MSA;
    ARGBToUVRow = ARGBToUVRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_MSA;
    }
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVRow = ARGBToUVRow_MSA;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  return 0;
}

// Converts ARGB to NV12: a Y plane plus one interleaved UV plane.
// Source rows are consumed in pairs; each pair yields two Y rows and one
// merged UV row of halfwidth samples per channel.
// Returns -1 on a NULL pointer, width <= 0 or height == 0, otherwise 0.
LIBYUV_API
int ARGBToNV12(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  int y;
  int halfwidth = (width + 1) >> 1;
  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
                      uint8_t* dst_u, uint8_t* dst_v, int width) =
      ARGBToUVRow_C;
  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
      ARGBToYRow_C;
  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) = MergeUVRow_C;
  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Row function selection: a later matching block overrides an earlier one,
  // so the order of the blocks below matters.
#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToUVRow = ARGBToUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYRow = ARGBToYRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVRow = ARGBToUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYRow = ARGBToYRow_Any_MMI;
    ARGBToUVRow = ARGBToUVRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_MMI;
    }
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYRow = ARGBToYRow_Any_MSA;
    ARGBToUVRow = ARGBToUVRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_MSA;
    }
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVRow = ARGBToUVRow_MSA;
    }
  }
#endif
  // The merge row function is selected on halfwidth, not width.
#if defined(HAS_MERGEUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUVRow_ = MergeUVRow_SSE2;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    MergeUVRow_ = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
      MergeUVRow_ = MergeUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUVRow_ = MergeUVRow_NEON;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    MergeUVRow_ = MergeUVRow_Any_MMI;
    if (IS_ALIGNED(halfwidth, 8)) {
      MergeUVRow_ = MergeUVRow_MMI;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    MergeUVRow_ = MergeUVRow_Any_MSA;
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUVRow_ = MergeUVRow_MSA;
    }
  }
#endif
  {
    // Allocate a rows of uv.
    // One buffer holds both temporary rows: row_u first, row_v right after
    // it, each halfwidth rounded up to a multiple of 32 bytes.
    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);

    for (y = 0; y < height - 1; y += 2) {
      // The UV row function gets the stride to the second row of the pair.
      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
      ARGBToYRow(src_argb, dst_y, width);
      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
      src_argb += src_stride_argb * 2;
      dst_y += dst_stride_y * 2;
      dst_uv += dst_stride_uv;
    }
    // Odd height: the last row has no partner, so the UV stride is 0.
    if (height & 1) {
      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
      MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
      ARGBToYRow(src_argb, dst_y, width);
    }
    free_aligned_buffer_64(row_u);
  }
  return 0;
}

// Same as NV12 but U and V swapped.
// The swap is done by passing row_v before row_u to the merge row function.
// Returns -1 on a NULL pointer, width <= 0 or height == 0, otherwise 0.
LIBYUV_API
int ARGBToNV21(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_vu,
               int dst_stride_vu,
               int width,
               int height) {
  int y;
  int halfwidth = (width + 1) >> 1;
  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
                      uint8_t* dst_u, uint8_t* dst_v, int width) =
      ARGBToUVRow_C;
  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
      ARGBToYRow_C;
  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_vu, int width) = MergeUVRow_C;
  if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Row function selection: a later matching block overrides an earlier one,
  // so the order of the blocks below matters.
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYRow = ARGBToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYRow = ARGBToYRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVRow = ARGBToUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYRow = ARGBToYRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToUVRow = ARGBToUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYRow = ARGBToYRow_Any_MMI;
    ARGBToUVRow = ARGBToUVRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYRow = ARGBToYRow_MMI;
    }
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVRow = ARGBToUVRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYRow = ARGBToYRow_Any_MSA;
    ARGBToUVRow = ARGBToUVRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_MSA;
    }
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVRow = ARGBToUVRow_MSA;
    }
  }
#endif

  // The merge row function is selected on halfwidth, not width.
#if defined(HAS_MERGEUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    MergeUVRow_ = MergeUVRow_Any_SSE2;
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUVRow_ = MergeUVRow_SSE2;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    MergeUVRow_ = MergeUVRow_Any_AVX2;
    if (IS_ALIGNED(halfwidth, 32)) {
      MergeUVRow_ = MergeUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MergeUVRow_ = MergeUVRow_Any_NEON;
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUVRow_ = MergeUVRow_NEON;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    MergeUVRow_ = MergeUVRow_Any_MMI;
    if (IS_ALIGNED(halfwidth, 8)) {
      MergeUVRow_ = MergeUVRow_MMI;
    }
  }
#endif
#if defined(HAS_MERGEUVROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    MergeUVRow_ = MergeUVRow_Any_MSA;
    if (IS_ALIGNED(halfwidth, 16)) {
      MergeUVRow_ = MergeUVRow_MSA;
    }
  }
#endif
  {
    // Allocate a rows of uv.
    // One buffer holds both temporary rows: row_u first, row_v right after
    // it, each halfwidth rounded up to a multiple of 32 bytes.
    align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);

    for (y = 0; y < height - 1; y += 2) {
      // The UV row function gets the stride to the second row of the pair.
      ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
      // V is passed first so the merged row is in VU order.
      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
      ARGBToYRow(src_argb, dst_y, width);
      ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
      src_argb += src_stride_argb * 2;
      dst_y += dst_stride_y * 2;
      dst_vu += dst_stride_vu;
    }
    // Odd height: the last row has no partner, so the UV stride is 0.
    if (height & 1) {
      ARGBToUVRow(src_argb, 0, row_u, row_v, width);
      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
      ARGBToYRow(src_argb, dst_y, width);
    }
    free_aligned_buffer_64(row_u);
  }
  return 0;
}

// NOTE(review): ABGRToNV12 continues past the end of this chunk; only its
// signature, local declarations and argument check are visible here, so the
// rest of its behavior is not documented.
LIBYUV_API
int ABGRToNV12(const uint8_t* src_abgr,
               int src_stride_abgr,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  int y;
  int halfwidth = (width + 1) >> 1;
  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
                      uint8_t* dst_u, uint8_t* dst_v, int width) =
      ABGRToUVRow_C;
  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
      ABGRToYRow_C;
  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) = MergeUVRow_C;
  if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
if (height < 0) { height = -height; src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } #if defined(HAS_ABGRTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_SSSE3; } } #endif #if defined(HAS_ABGRTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif #if defined(HAS_ABGRTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToYRow = ABGRToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ABGRToYRow = ABGRToYRow_AVX2; } } #endif #if defined(HAS_ABGRTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToUVRow = ABGRToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ABGRToUVRow = ABGRToUVRow_AVX2; } } #endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_NEON; } } #endif #if defined(HAS_ABGRTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVRow = ABGRToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_NEON; } } #endif #if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ABGRToYRow = ABGRToYRow_Any_MMI; ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_MMI; } if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_MMI; } } #endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_MSA; } if (IS_ALIGNED(width, 32)) { ABGRToUVRow = ABGRToUVRow_MSA; } } #endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { 
MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_NEON; } } #endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow_ = MergeUVRow_Any_MMI; if (IS_ALIGNED(halfwidth, 8)) { MergeUVRow_ = MergeUVRow_MMI; } } #endif #if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MergeUVRow_ = MergeUVRow_Any_MSA; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_MSA; } } #endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); MergeUVRow_(row_u, row_v, dst_uv, halfwidth); ABGRToYRow(src_abgr, dst_y, width); ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); src_abgr += src_stride_abgr * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; } if (height & 1) { ABGRToUVRow(src_abgr, 0, row_u, row_v, width); MergeUVRow_(row_u, row_v, dst_uv, halfwidth); ABGRToYRow(src_abgr, dst_y, width); } free_aligned_buffer_64(row_u); } return 0; } // Same as NV12 but U and V swapped. LIBYUV_API int ABGRToNV21(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { int y; int halfwidth = (width + 1) >> 1; void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) = ABGRToUVRow_C; void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_vu, int width) = MergeUVRow_C; if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_abgr = src_abgr + (height - 1) * src_stride_abgr; src_stride_abgr = -src_stride_abgr; } #if defined(HAS_ABGRTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToYRow = ABGRToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_SSSE3; } } #endif #if defined(HAS_ABGRTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ABGRToUVRow = ABGRToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_SSSE3; } } #endif #if defined(HAS_ABGRTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToYRow = ABGRToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ABGRToYRow = ABGRToYRow_AVX2; } } #endif #if defined(HAS_ABGRTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ABGRToUVRow = ABGRToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ABGRToUVRow = ABGRToUVRow_AVX2; } } #endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_NEON; } } #endif #if defined(HAS_ABGRTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToUVRow = ABGRToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_NEON; } } #endif #if defined(HAS_ABGRTOYROW_MMI) && defined(HAS_ABGRTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ABGRToYRow = ABGRToYRow_Any_MMI; ABGRToUVRow = ABGRToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ABGRToYRow = ABGRToYRow_MMI; } if (IS_ALIGNED(width, 16)) { ABGRToUVRow = ABGRToUVRow_MMI; } } #endif #if defined(HAS_ABGRTOYROW_MSA) && defined(HAS_ABGRTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ABGRToYRow = ABGRToYRow_Any_MSA; ABGRToUVRow = ABGRToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ABGRToYRow = ABGRToYRow_MSA; } if (IS_ALIGNED(width, 32)) { ABGRToUVRow = ABGRToUVRow_MSA; } } #endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { 
MergeUVRow_ = MergeUVRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { MergeUVRow_ = MergeUVRow_AVX2; } } #endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_ = MergeUVRow_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_NEON; } } #endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow_ = MergeUVRow_Any_MMI; if (IS_ALIGNED(halfwidth, 8)) { MergeUVRow_ = MergeUVRow_MMI; } } #endif #if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MergeUVRow_ = MergeUVRow_Any_MSA; if (IS_ALIGNED(halfwidth, 16)) { MergeUVRow_ = MergeUVRow_MSA; } } #endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ABGRToYRow(src_abgr, dst_y, width); ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); src_abgr += src_stride_abgr * 2; dst_y += dst_stride_y * 2; dst_vu += dst_stride_vu; } if (height & 1) { ABGRToUVRow(src_abgr, 0, row_u, row_v, width); MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ABGRToYRow(src_abgr, dst_y, width); } free_aligned_buffer_64(row_u); } return 0; } // Convert ARGB to YUY2. LIBYUV_API int ARGBToYUY2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_yuy2, int dst_stride_yuy2, int width, int height) { int y; void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_yuy2 = 0; } #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } } #endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; } } #endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_MMI; } } #endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = 
I422ToYUY2Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } } #endif #if defined(HAS_I422TOYUY2ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToYUY2Row = I422ToYUY2Row_AVX2; } } #endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToYUY2Row = I422ToYUY2Row_NEON; } } #endif #if defined(HAS_I422TOYUY2ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToYUY2Row = I422ToYUY2Row_Any_MMI; if (IS_ALIGNED(width, 8)) { I422ToYUY2Row = I422ToYUY2Row_MMI; } } #endif #if defined(HAS_I422TOYUY2ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToYUY2Row = I422ToYUY2Row_Any_MSA; if (IS_ALIGNED(width, 32)) { I422ToYUY2Row = I422ToYUY2Row_MSA; } } #endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); uint8_t* row_u = row_y + ((width + 63) & ~63); uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); ARGBToYRow(src_argb, row_y, width); I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width); src_argb += src_stride_argb; dst_yuy2 += dst_stride_yuy2; } free_aligned_buffer_64(row_y); } return 0; } // Convert ARGB to UYVY. LIBYUV_API int ARGBToUYVY(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_uyvy, int dst_stride_uyvy, int width, int height) { int y; void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) = ARGBToUVRow_C; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_uyvy = 0; } #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToUVRow = ARGBToUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } } #endif #if defined(HAS_ARGBTOUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; } } #endif #if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; ARGBToUVRow = ARGBToUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_MMI; } } #endif #if defined(HAS_ARGBTOYROW_MSA) && defined(HAS_ARGBTOUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } if (IS_ALIGNED(width, 32)) { ARGBToUVRow = ARGBToUVRow_MSA; } } #endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = 
I422ToUYVYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } } #endif #if defined(HAS_I422TOUYVYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { I422ToUYVYRow = I422ToUYVYRow_AVX2; } } #endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { I422ToUYVYRow = I422ToUYVYRow_NEON; } } #endif #if defined(HAS_I422TOUYVYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToUYVYRow = I422ToUYVYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { I422ToUYVYRow = I422ToUYVYRow_MMI; } } #endif #if defined(HAS_I422TOUYVYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToUYVYRow = I422ToUYVYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { I422ToUYVYRow = I422ToUYVYRow_MSA; } } #endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); uint8_t* row_u = row_y + ((width + 63) & ~63); uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); ARGBToYRow(src_argb, row_y, width); I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width); src_argb += src_stride_argb; dst_uyvy += dst_stride_uyvy; } free_aligned_buffer_64(row_y); } return 0; } // Convert ARGB to I400. LIBYUV_API int ARGBToI400(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_y == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = 0; } #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYRow = ARGBToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYRow = ARGBToYRow_AVX2; } } #endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; } } #endif #if defined(HAS_ARGBTOYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYRow = ARGBToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_MMI; } } #endif #if defined(HAS_ARGBTOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYRow = ARGBToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYRow = ARGBToYRow_MSA; } } #endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); src_argb += src_stride_argb; dst_y += dst_stride_y; } return 0; } // Shuffle table for converting ARGB to RGBA. static const uvec8 kShuffleMaskARGBToRGBA = { 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; // Convert ARGB to RGBA. LIBYUV_API int ARGBToRGBA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_rgba, int dst_stride_rgba, int width, int height) { return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); } // Convert ARGB To RGB24. 
// Hands each image row to an ARGBToRGB24Row variant. The C version is the
// default; a SIMD variant replaces it when the build defines it and
// TestCpuFlag reports the feature, with later checks overriding earlier ones.
// Returns 0 on success, or -1 if a pointer is null, width <= 0 or
// height == 0. A negative height reads the source from its last row upwards.
LIBYUV_API
int ARGBToRGB24(const uint8_t* src_argb,
                int src_stride_argb,
                uint8_t* dst_rgb24,
                int dst_stride_rgb24,
                int width,
                int height) {
  void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                         int width) = ARGBToRGB24Row_C;
  int row;

  if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    height = -height;
    src_argb += (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // With no padding in either image, process everything as one long row.
  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
    width *= height;
    height = 1;
    src_stride_argb = 0;
    dst_stride_rgb24 = 0;
  }
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToRGB24Row = IS_ALIGNED(width, 16) ? ARGBToRGB24Row_SSSE3
                                           : ARGBToRGB24Row_Any_SSSE3;
  }
#endif
#if defined(HAS_ARGBTORGB24ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToRGB24Row =
        IS_ALIGNED(width, 32) ? ARGBToRGB24Row_AVX2 : ARGBToRGB24Row_Any_AVX2;
  }
#endif
#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
    ARGBToRGB24Row = IS_ALIGNED(width, 32) ? ARGBToRGB24Row_AVX512VBMI
                                           : ARGBToRGB24Row_Any_AVX512VBMI;
  }
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToRGB24Row =
        IS_ALIGNED(width, 16) ? ARGBToRGB24Row_NEON : ARGBToRGB24Row_Any_NEON;
  }
#endif
#if defined(HAS_ARGBTORGB24ROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToRGB24Row =
        IS_ALIGNED(width, 4) ? ARGBToRGB24Row_MMI : ARGBToRGB24Row_Any_MMI;
  }
#endif
#if defined(HAS_ARGBTORGB24ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToRGB24Row =
        IS_ALIGNED(width, 16) ? ARGBToRGB24Row_MSA : ARGBToRGB24Row_Any_MSA;
  }
#endif

  for (row = 0; row < height; ++row) {
    ARGBToRGB24Row(src_argb, dst_rgb24, width);
    src_argb += src_stride_argb;
    dst_rgb24 += dst_stride_rgb24;
  }
  return 0;
}

// Convert ARGB To RAW.
// Hands each image row to an ARGBToRAWRow variant. The C version is the
// default; a SIMD variant replaces it when the build defines it and
// TestCpuFlag reports the feature, with later checks overriding earlier ones.
// Returns 0 on success, or -1 if a pointer is null, width <= 0 or
// height == 0. A negative height reads the source from its last row upwards.
LIBYUV_API
int ARGBToRAW(const uint8_t* src_argb,
              int src_stride_argb,
              uint8_t* dst_raw,
              int dst_stride_raw,
              int width,
              int height) {
  void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
      ARGBToRAWRow_C;
  int row;

  if (!src_argb || !dst_raw || width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    height = -height;
    src_argb += (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // With no padding in either image, process everything as one long row.
  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
    width *= height;
    height = 1;
    src_stride_argb = 0;
    dst_stride_raw = 0;
  }
#if defined(HAS_ARGBTORAWROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToRAWRow =
        IS_ALIGNED(width, 16) ? ARGBToRAWRow_SSSE3 : ARGBToRAWRow_Any_SSSE3;
  }
#endif
#if defined(HAS_ARGBTORAWROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToRAWRow =
        IS_ALIGNED(width, 32) ? ARGBToRAWRow_AVX2 : ARGBToRAWRow_Any_AVX2;
  }
#endif
#if defined(HAS_ARGBTORAWROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToRAWRow =
        IS_ALIGNED(width, 8) ? ARGBToRAWRow_NEON : ARGBToRAWRow_Any_NEON;
  }
#endif
#if defined(HAS_ARGBTORAWROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToRAWRow =
        IS_ALIGNED(width, 4) ? ARGBToRAWRow_MMI : ARGBToRAWRow_Any_MMI;
  }
#endif
#if defined(HAS_ARGBTORAWROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToRAWRow =
        IS_ALIGNED(width, 16) ? ARGBToRAWRow_MSA : ARGBToRAWRow_Any_MSA;
  }
#endif

  for (row = 0; row < height; ++row) {
    ARGBToRAWRow(src_argb, dst_raw, width);
    src_argb += src_stride_argb;
    dst_raw += dst_stride_raw;
  }
  return 0;
}

// Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
static const uint8_t kDither565_4x4[16] = {
    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};

// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
LIBYUV_API int ARGBToRGB565Dither(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_rgb565, int dst_stride_rgb565, const uint8_t* dither4x4, int width, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } if (!dither4x4) { dither4x4 = kDither565_4x4; } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MMI; if (IS_ALIGNED(width, 4)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MMI; } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; } } #endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), width); src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } return 0; } // Convert ARGB To RGB565. // TODO(fbarchard): Consider using dither function low level with zeros. 
// Hands each image row to an ARGBToRGB565Row variant. The C version is the
// default; a SIMD variant replaces it when the build defines it and
// TestCpuFlag reports the feature, with later checks overriding earlier ones.
// Returns 0 on success, or -1 if a pointer is null, width <= 0 or
// height == 0. A negative height reads the source from its last row upwards.
LIBYUV_API
int ARGBToRGB565(const uint8_t* src_argb,
                 int src_stride_argb,
                 uint8_t* dst_rgb565,
                 int dst_stride_rgb565,
                 int width,
                 int height) {
  void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                          int width) = ARGBToRGB565Row_C;
  int row;

  if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    height = -height;
    src_argb += (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // With no padding in either image, process everything as one long row.
  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
    width *= height;
    height = 1;
    src_stride_argb = 0;
    dst_stride_rgb565 = 0;
  }
#if defined(HAS_ARGBTORGB565ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ARGBToRGB565Row =
        IS_ALIGNED(width, 4) ? ARGBToRGB565Row_SSE2 : ARGBToRGB565Row_Any_SSE2;
  }
#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToRGB565Row =
        IS_ALIGNED(width, 8) ? ARGBToRGB565Row_AVX2 : ARGBToRGB565Row_Any_AVX2;
  }
#endif
#if defined(HAS_ARGBTORGB565ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToRGB565Row =
        IS_ALIGNED(width, 8) ? ARGBToRGB565Row_NEON : ARGBToRGB565Row_Any_NEON;
  }
#endif
#if defined(HAS_ARGBTORGB565ROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToRGB565Row =
        IS_ALIGNED(width, 4) ? ARGBToRGB565Row_MMI : ARGBToRGB565Row_Any_MMI;
  }
#endif
#if defined(HAS_ARGBTORGB565ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToRGB565Row =
        IS_ALIGNED(width, 8) ? ARGBToRGB565Row_MSA : ARGBToRGB565Row_Any_MSA;
  }
#endif

  for (row = 0; row < height; ++row) {
    ARGBToRGB565Row(src_argb, dst_rgb565, width);
    src_argb += src_stride_argb;
    dst_rgb565 += dst_stride_rgb565;
  }
  return 0;
}

// Convert ARGB To ARGB1555.
// Hands each image row to an ARGBToARGB1555Row variant. The C version is the
// default; a SIMD variant replaces it when the build defines it and
// TestCpuFlag reports the feature, with later checks overriding earlier ones.
// Returns 0 on success, or -1 if a pointer is null, width <= 0 or
// height == 0. A negative height reads the source from its last row upwards.
LIBYUV_API
int ARGBToARGB1555(const uint8_t* src_argb,
                   int src_stride_argb,
                   uint8_t* dst_argb1555,
                   int dst_stride_argb1555,
                   int width,
                   int height) {
  void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                            int width) = ARGBToARGB1555Row_C;
  int row;

  if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    height = -height;
    src_argb += (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // With no padding in either image, process everything as one long row.
  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
    width *= height;
    height = 1;
    src_stride_argb = 0;
    dst_stride_argb1555 = 0;
  }
#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ARGBToARGB1555Row = IS_ALIGNED(width, 4) ? ARGBToARGB1555Row_SSE2
                                             : ARGBToARGB1555Row_Any_SSE2;
  }
#endif
#if defined(HAS_ARGBTOARGB1555ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToARGB1555Row = IS_ALIGNED(width, 8) ? ARGBToARGB1555Row_AVX2
                                             : ARGBToARGB1555Row_Any_AVX2;
  }
#endif
#if defined(HAS_ARGBTOARGB1555ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToARGB1555Row = IS_ALIGNED(width, 8) ? ARGBToARGB1555Row_NEON
                                             : ARGBToARGB1555Row_Any_NEON;
  }
#endif
#if defined(HAS_ARGBTOARGB1555ROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToARGB1555Row = IS_ALIGNED(width, 4) ? ARGBToARGB1555Row_MMI
                                             : ARGBToARGB1555Row_Any_MMI;
  }
#endif
#if defined(HAS_ARGBTOARGB1555ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToARGB1555Row = IS_ALIGNED(width, 8) ? ARGBToARGB1555Row_MSA
                                             : ARGBToARGB1555Row_Any_MSA;
  }
#endif

  for (row = 0; row < height; ++row) {
    ARGBToARGB1555Row(src_argb, dst_argb1555, width);
    src_argb += src_stride_argb;
    dst_argb1555 += dst_stride_argb1555;
  }
  return 0;
}

// Convert ARGB To ARGB4444.
// Hands each image row to an ARGBToARGB4444Row variant. The C version is the
// default; a SIMD variant replaces it when the build defines it and
// TestCpuFlag reports the feature, with later checks overriding earlier ones.
// Returns 0 on success, or -1 if a pointer is null, width <= 0 or
// height == 0. A negative height reads the source from its last row upwards.
LIBYUV_API
int ARGBToARGB4444(const uint8_t* src_argb,
                   int src_stride_argb,
                   uint8_t* dst_argb4444,
                   int dst_stride_argb4444,
                   int width,
                   int height) {
  void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
                            int width) = ARGBToARGB4444Row_C;
  int row;

  if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
    return -1;
  }
  if (height < 0) {
    height = -height;
    src_argb += (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // With no padding in either image, process everything as one long row.
  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
    width *= height;
    height = 1;
    src_stride_argb = 0;
    dst_stride_argb4444 = 0;
  }
#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ARGBToARGB4444Row = IS_ALIGNED(width, 4) ? ARGBToARGB4444Row_SSE2
                                             : ARGBToARGB4444Row_Any_SSE2;
  }
#endif
#if defined(HAS_ARGBTOARGB4444ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToARGB4444Row = IS_ALIGNED(width, 8) ? ARGBToARGB4444Row_AVX2
                                             : ARGBToARGB4444Row_Any_AVX2;
  }
#endif
#if defined(HAS_ARGBTOARGB4444ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToARGB4444Row = IS_ALIGNED(width, 8) ? ARGBToARGB4444Row_NEON
                                             : ARGBToARGB4444Row_Any_NEON;
  }
#endif
#if defined(HAS_ARGBTOARGB4444ROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToARGB4444Row = IS_ALIGNED(width, 4) ? ARGBToARGB4444Row_MMI
                                             : ARGBToARGB4444Row_Any_MMI;
  }
#endif
#if defined(HAS_ARGBTOARGB4444ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToARGB4444Row = IS_ALIGNED(width, 8) ? ARGBToARGB4444Row_MSA
                                             : ARGBToARGB4444Row_Any_MSA;
  }
#endif

  for (row = 0; row < height; ++row) {
    ARGBToARGB4444Row(src_argb, dst_argb4444, width);
    src_argb += src_stride_argb;
    dst_argb4444 += dst_stride_argb4444;
  }
  return 0;
}

// Convert ABGR To AR30.
// Returns 0 on success, or -1 when a pointer is null, width is not positive,
// or height is zero. A negative height reads the source bottom-up.
LIBYUV_API
int ABGRToAR30(const uint8_t* src_abgr,
               int src_stride_abgr,
               uint8_t* dst_ar30,
               int dst_stride_ar30,
               int width,
               int height) {
  int y;
  void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
      ABGRToAR30Row_C;
  if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height: start at the last source row and step backwards.
  if (height < 0) {
    height = -height;
    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
    src_stride_abgr = -src_stride_abgr;
  }
  // Coalesce rows: when both strides equal width * 4 the image is handled as
  // a single row of width * height pixels.
  if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
    width *= height;
    height = 1;
    src_stride_abgr = dst_stride_ar30 = 0;
  }
  // Each later CPU match overrides the earlier one. The _Any_ variant is kept
  // unless width is a multiple of the checked alignment.
#if defined(HAS_ABGRTOAR30ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
    if (IS_ALIGNED(width, 4)) {
      ABGRToAR30Row = ABGRToAR30Row_SSSE3;
    }
  }
#endif
#if defined(HAS_ABGRTOAR30ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ABGRToAR30Row = ABGRToAR30Row_AVX2;
    }
  }
#endif
  for (y = 0; y < height; ++y) {
    ABGRToAR30Row(src_abgr, dst_ar30, width);
    src_abgr += src_stride_abgr;
    dst_ar30 += dst_stride_ar30;
  }
  return 0;
}

// Convert ARGB To AR30.
// Returns 0 on success, or -1 when a pointer is null, width is not positive,
// or height is zero. A negative height reads the source bottom-up.
LIBYUV_API
int ARGBToAR30(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_ar30,
               int dst_stride_ar30,
               int width,
               int height) {
  int y;
  void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
      ARGBToAR30Row_C;
  if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height: start at the last source row and step backwards.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows: when both strides equal width * 4 the image is handled as
  // a single row of width * height pixels.
  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_ar30 = 0;
  }
  // Each later CPU match overrides the earlier one. The _Any_ variant is kept
  // unless width is a multiple of the checked alignment.
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
    if (IS_ALIGNED(width, 4)) {
      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOAR30ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBToAR30Row = ARGBToAR30Row_AVX2;
    }
  }
#endif
  for (y = 0; y < height; ++y) {
    ARGBToAR30Row(src_argb, dst_ar30, width);
    src_argb += src_stride_argb;
    dst_ar30 += dst_stride_ar30;
  }
  return 0;
}

// Convert ARGB to J420. (JPeg full range I420).
// Writes one Y row per source row and one U and one V row per pair of source
// rows. Returns 0 on success, or -1 when a pointer is null, width is not
// positive, or height is zero.
LIBYUV_API
int ARGBToJ420(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_yj,
               int dst_stride_yj,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  int y;
  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
                       uint8_t* dst_u, uint8_t* dst_v, int width) =
      ARGBToUVJRow_C;
  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
      ARGBToYJRow_C;
  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // The Y and UV row functions are selected independently. Each later CPU
  // match overrides the earlier one.
#if defined(HAS_ARGBTOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYJRow = ARGBToYJRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYJRow = ARGBToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYJRow = ARGBToYJRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVJRow = ARGBToUVJRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYJRow = ARGBToYJRow_Any_MMI;
    ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYJRow = ARGBToYJRow_MMI;
    }
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVJRow = ARGBToUVJRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYJRow = ARGBToYJRow_Any_MSA;
    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYJRow = ARGBToYJRow_MSA;
    }
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVJRow = ARGBToUVJRow_MSA;
    }
  }
#endif

  // Process two source rows per iteration: one UV row and two Y rows.
  for (y = 0; y < height - 1; y += 2) {
    ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
    ARGBToYJRow(src_argb, dst_yj, width);
    ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
    src_argb += src_stride_argb * 2;
    dst_yj += dst_stride_yj * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  // Odd height: the last row has no partner, so the UV row function is given
  // a stride of 0 and reads that same row twice.
  if (height & 1) {
    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
    ARGBToYJRow(src_argb, dst_yj, width);
  }
  return 0;
}

// Convert ARGB to J422. (JPeg full range I422).
// Writes one Y, one U and one V row per source row. Returns 0 on success, or
// -1 when a pointer is null, width is not positive, or height is zero.
LIBYUV_API
int ARGBToJ422(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_yj,
               int dst_stride_yj,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int width,
               int height) {
  int y;
  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
                       uint8_t* dst_u, uint8_t* dst_v, int width) =
      ARGBToUVJRow_C;
  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
      ARGBToYJRow_C;
  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows. The chroma test (stride * 2 == width) can only hold for an
  // even width.
  if (src_stride_argb == width * 4 && dst_stride_yj == width &&
      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
  }
  // The Y and UV row functions are selected independently. Each later CPU
  // match overrides the earlier one.
#if defined(HAS_ARGBTOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVJRow = ARGBToUVJRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYJRow = ARGBToYJRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYJRow = ARGBToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYJRow = ARGBToYJRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOUVJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVJRow = ARGBToUVJRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_MMI) && defined(HAS_ARGBTOUVJROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYJRow = ARGBToYJRow_Any_MMI;
    ARGBToUVJRow = ARGBToUVJRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYJRow = ARGBToYJRow_MMI;
    }
    if (IS_ALIGNED(width, 16)) {
      ARGBToUVJRow = ARGBToUVJRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_MSA) && defined(HAS_ARGBTOUVJROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYJRow = ARGBToYJRow_Any_MSA;
    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYJRow = ARGBToYJRow_MSA;
    }
    if (IS_ALIGNED(width, 32)) {
      ARGBToUVJRow = ARGBToUVJRow_MSA;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    // Stride 0: the UV row function is given the same source row twice.
    ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
    ARGBToYJRow(src_argb, dst_yj, width);
    src_argb += src_stride_argb;
    dst_yj += dst_stride_yj;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  return 0;
}

// Convert ARGB to AR64.
// dst_ar64 and dst_stride_ar64 are in uint16_t units. Returns 0 on success,
// or -1 when a pointer is null, width is not positive, or height is zero.
LIBYUV_API
int ARGBToAR64(const uint8_t* src_argb,
               int src_stride_argb,
               uint16_t* dst_ar64,
               int dst_stride_ar64,
               int width,
               int height) {
  int y;
  void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
                        int width) = ARGBToAR64Row_C;
  if (!src_argb || !dst_ar64 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows.
  if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_ar64 = 0;
  }
  // Each later CPU match overrides the earlier one. The _Any_ variant is kept
  // unless width is a multiple of the checked alignment.
#if defined(HAS_ARGBTOAR64ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3;
    if (IS_ALIGNED(width, 4)) {
      ARGBToAR64Row = ARGBToAR64Row_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOAR64ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToAR64Row = ARGBToAR64Row_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBToAR64Row = ARGBToAR64Row_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOAR64ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToAR64Row = ARGBToAR64Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToAR64Row = ARGBToAR64Row_NEON;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    ARGBToAR64Row(src_argb, dst_ar64, width);
    src_argb += src_stride_argb;
    dst_ar64 += dst_stride_ar64;
  }
  return 0;
}

// Convert ARGB to AB64.
// dst_ab64 and dst_stride_ab64 are in uint16_t units. Returns 0 on success,
// or -1 when a pointer is null, width is not positive, or height is zero.
LIBYUV_API
int ARGBToAB64(const uint8_t* src_argb,
               int src_stride_argb,
               uint16_t* dst_ab64,
               int dst_stride_ab64,
               int width,
               int height) {
  int y;
  void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64,
                        int width) = ARGBToAB64Row_C;
  if (!src_argb || !dst_ab64 || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows.
  if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_ab64 = 0;
  }
  // Each later CPU match overrides the earlier one. The _Any_ variant is kept
  // unless width is a multiple of the checked alignment.
#if defined(HAS_ARGBTOAB64ROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3;
    if (IS_ALIGNED(width, 4)) {
      ARGBToAB64Row = ARGBToAB64Row_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOAB64ROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToAB64Row = ARGBToAB64Row_Any_AVX2;
    if (IS_ALIGNED(width, 8)) {
      ARGBToAB64Row = ARGBToAB64Row_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOAB64ROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToAB64Row = ARGBToAB64Row_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToAB64Row = ARGBToAB64Row_NEON;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    ARGBToAB64Row(src_argb, dst_ab64, width);
    src_argb += src_stride_argb;
    dst_ab64 += dst_stride_ab64;
  }
  return 0;
}

// Convert ARGB to J400.
// Writes only a Y plane (one byte per pixel). Returns 0 on success, or -1
// when a pointer is null, width is not positive, or height is zero.
LIBYUV_API
int ARGBToJ400(const uint8_t* src_argb,
               int src_stride_argb,
               uint8_t* dst_yj,
               int dst_stride_yj,
               int width,
               int height) {
  int y;
  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
      ARGBToYJRow_C;
  if (!src_argb || !dst_yj || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height: start at the last source row and step backwards.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Coalesce rows.
  if (src_stride_argb == width * 4 && dst_stride_yj == width) {
    width *= height;
    height = 1;
    src_stride_argb = dst_stride_yj = 0;
  }
  // Each later CPU match overrides the earlier one. The _Any_ variant is kept
  // unless width is a multiple of the checked alignment.
#if defined(HAS_ARGBTOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYJRow = ARGBToYJRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ARGBToYJRow = ARGBToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYJRow = ARGBToYJRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ARGBToYJRow = ARGBToYJRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ARGBToYJRow = ARGBToYJRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBTOYJROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ARGBToYJRow = ARGBToYJRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ARGBToYJRow = ARGBToYJRow_MSA;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    ARGBToYJRow(src_argb, dst_yj, width);
    src_argb += src_stride_argb;
    dst_yj += dst_stride_yj;
  }
  return 0;
}

// Convert RGBA to J400.
// Writes only a Y plane (one byte per pixel). Returns 0 on success, or -1
// when a pointer is null, width is not positive, or height is zero.
LIBYUV_API
int RGBAToJ400(const uint8_t* src_rgba,
               int src_stride_rgba,
               uint8_t* dst_yj,
               int dst_stride_yj,
               int width,
               int height) {
  int y;
  void (*RGBAToYJRow)(const uint8_t* src_rgba, uint8_t* dst_yj, int width) =
      RGBAToYJRow_C;
  if (!src_rgba || !dst_yj || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height: start at the last source row and step backwards.
  if (height < 0) {
    height = -height;
    src_rgba = src_rgba + (height - 1) * src_stride_rgba;
    src_stride_rgba = -src_stride_rgba;
  }
  // Coalesce rows.
  if (src_stride_rgba == width * 4 && dst_stride_yj == width) {
    width *= height;
    height = 1;
    src_stride_rgba = dst_stride_yj = 0;
  }
  // Each later CPU match overrides the earlier one. The _Any_ variant is kept
  // unless width is a multiple of the checked alignment.
#if defined(HAS_RGBATOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    RGBAToYJRow = RGBAToYJRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      RGBAToYJRow = RGBAToYJRow_SSSE3;
    }
  }
#endif
#if defined(HAS_RGBATOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    RGBAToYJRow = RGBAToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      RGBAToYJRow = RGBAToYJRow_AVX2;
    }
  }
#endif
#if defined(HAS_RGBATOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    RGBAToYJRow = RGBAToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
      RGBAToYJRow = RGBAToYJRow_NEON;
    }
  }
#endif
#if defined(HAS_RGBATOYJROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    RGBAToYJRow = RGBAToYJRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      RGBAToYJRow = RGBAToYJRow_MMI;
    }
  }
#endif
#if defined(HAS_RGBATOYJROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    RGBAToYJRow = RGBAToYJRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      RGBAToYJRow = RGBAToYJRow_MSA;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    RGBAToYJRow(src_rgba, dst_yj, width);
    src_rgba += src_stride_rgba;
    dst_yj += dst_stride_yj;
  }
  return 0;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/source/convert_jpeg.cc000066400000000000000000000562131416500237200220310ustar00rootroot00000000000000/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
*/

#include "libyuv/convert.h"
#include "libyuv/convert_argb.h"

#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#ifdef HAVE_JPEG
// Destination state shared between MJPGToI420 and its decoder callbacks.
// The callbacks advance y/u/v past the rows they wrote and subtract those
// rows from h.
struct I420Buffers {
  uint8_t* y;
  int y_stride;
  uint8_t* u;
  int u_stride;
  uint8_t* v;
  int v_stride;
  int w;  // Width passed to every conversion call.
  int h;  // Initialised to the destination height; decremented per callback.
};

// Decoder callback: copies `rows` rows of I420 planes from `data` into the
// I420Buffers passed as `opaque`, then advances the destination pointers.
static void JpegCopyI420(void* opaque,
                         const uint8_t* const* data,
                         const int* strides,
                         int rows) {
  I420Buffers* dest = (I420Buffers*)(opaque);
  I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
           dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
           dest->v_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  // Chroma advances by half the rows, rounded up.
  dest->u += ((rows + 1) >> 1) * dest->u_stride;
  dest->v += ((rows + 1) >> 1) * dest->v_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I422 planes to I420 in the
// I420Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI422ToI420(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  I420Buffers* dest = (I420Buffers*)(opaque);
  I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
             dest->v_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->u += ((rows + 1) >> 1) * dest->u_stride;
  dest->v += ((rows + 1) >> 1) * dest->v_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I444 planes to I420 in the
// I420Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI444ToI420(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  I420Buffers* dest = (I420Buffers*)(opaque);
  I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
             dest->v_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->u += ((rows + 1) >> 1) * dest->u_stride;
  dest->v += ((rows + 1) >> 1) * dest->v_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of a single I400 plane (data[0]) to
// I420 in the I420Buffers passed as `opaque`, then advances the pointers.
static void JpegI400ToI420(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  I420Buffers* dest = (I420Buffers*)(opaque);
  I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
             dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->u += ((rows + 1) >> 1) * dest->u_stride;
  dest->v += ((rows + 1) >> 1) * dest->v_stride;
  dest->h -= rows;
}

// Query size of MJPG in pixels.
// On success stores the frame dimensions in *width and *height and returns
// 0. When the frame cannot be loaded the outputs are left untouched and -1
// is returned. width and height are dereferenced without a null check.
LIBYUV_API
int MJPGSize(const uint8_t* src_mjpg,
             size_t src_size_mjpg,
             int* width,
             int* height) {
  MJpegDecoder mjpeg_decoder;
  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
  if (ret) {
    *width = mjpeg_decoder.GetWidth();
    *height = mjpeg_decoder.GetHeight();
  }
  mjpeg_decoder.UnloadFrame();
  return ret ? 0 : -1;  // -1 for runtime failure.
}

// MJPG (Motion JPeg) to I420
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
// dst_height may be enough.
// Returns 0 on success, -1 when src_size_mjpg is kUnknownDataSize, and 1 when
// the frame fails to load, its dimensions differ from src_width/src_height,
// its sampling layout is not one of the four handled below, or decoding
// fails.
LIBYUV_API
int MJPGToI420(const uint8_t* src_mjpg,
               size_t src_size_mjpg,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_u,
               int dst_stride_u,
               uint8_t* dst_v,
               int dst_stride_v,
               int src_width,
               int src_height,
               int dst_width,
               int dst_height) {
  if (src_size_mjpg == kUnknownDataSize) {
    // ERROR: MJPEG frame size unknown
    return -1;
  }

  // TODO(fbarchard): Port MJpeg to C.
  MJpegDecoder mjpeg_decoder;
  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
              mjpeg_decoder.GetHeight() != src_height)) {
    // ERROR: MJPEG frame has unexpected dimensions
    mjpeg_decoder.UnloadFrame();
    return 1;  // runtime failure
  }
  if (ret) {
    I420Buffers bufs = {dst_y, dst_stride_y, dst_u,     dst_stride_u,
                        dst_v, dst_stride_v, dst_width, dst_height};
    // The callback is chosen from the colour space, component count and
    // per-component sampling factors reported by the decoder.
    // YUV420
    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
        mjpeg_decoder.GetNumComponents() == 3 &&
        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
                                           dst_height);
      // YUV422
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
                                           dst_height);
      // YUV444
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
                                           dst_height);
      // YUV400
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceGrayscale &&
               mjpeg_decoder.GetNumComponents() == 1 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
                                           dst_height);
    } else {
      // TODO(fbarchard): Implement conversion for any other
      // colorspace/subsample factors that occur in practice. ERROR: Unable to
      // convert MJPEG frame because format is not supported
      mjpeg_decoder.UnloadFrame();
      return 1;
    }
  }
  // NOTE(review): UnloadFrame() is not called on this path; presumably the
  // decoder's destructor releases the frame - confirm in mjpeg_decoder.
  return ret ? 0 : 1;
}

// Destination state shared between the MJPG-to-NV21/NV12 entry points and
// their decoder callbacks. The callbacks advance y/vu past the rows they
// wrote and subtract those rows from h.
struct NV21Buffers {
  uint8_t* y;
  int y_stride;
  uint8_t* vu;
  int vu_stride;
  int w;  // Width passed to every conversion call.
  int h;  // Initialised to the destination height; decremented per callback.
};

// Decoder callback: converts `rows` rows of I420 planes to NV21 in the
// NV21Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI420ToNV21(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  I420ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  // The interleaved chroma plane advances by half the rows, rounded up.
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I422 planes to NV21 in the
// NV21Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI422ToNV21(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  I422ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I444 planes to NV21 in the
// NV21Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI444ToNV21(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  I444ToNV21(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of a single I400 plane (data[0]) to
// NV21 in the NV21Buffers passed as `opaque`, then advances the pointers.
static void JpegI400ToNV21(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
             dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// MJPG (Motion JPeg) to NV21
// Returns 0 on success, -1 when src_size_mjpg is kUnknownDataSize, and 1 when
// the frame fails to load, its dimensions differ from src_width/src_height,
// its sampling layout is not one of the four handled below, or decoding
// fails.
LIBYUV_API
int MJPGToNV21(const uint8_t* src_mjpg,
               size_t src_size_mjpg,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_vu,
               int dst_stride_vu,
               int src_width,
               int src_height,
               int dst_width,
               int dst_height) {
  if (src_size_mjpg == kUnknownDataSize) {
    // ERROR: MJPEG frame size unknown
    return -1;
  }

  // TODO(fbarchard): Port MJpeg to C.
  MJpegDecoder mjpeg_decoder;
  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
              mjpeg_decoder.GetHeight() != src_height)) {
    // ERROR: MJPEG frame has unexpected dimensions
    mjpeg_decoder.UnloadFrame();
    return 1;  // runtime failure
  }
  if (ret) {
    NV21Buffers bufs = {dst_y,         dst_stride_y, dst_vu,
                        dst_stride_vu, dst_width,    dst_height};
    // The callback is chosen from the colour space, component count and
    // per-component sampling factors reported by the decoder.
    // YUV420
    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
        mjpeg_decoder.GetNumComponents() == 3 &&
        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV21, &bufs, dst_width,
                                           dst_height);
      // YUV422
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV21, &bufs, dst_width,
                                           dst_height);
      // YUV444
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV21, &bufs, dst_width,
                                           dst_height);
      // YUV400
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceGrayscale &&
               mjpeg_decoder.GetNumComponents() == 1 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV21, &bufs, dst_width,
                                           dst_height);
    } else {
      // Unknown colorspace.
      mjpeg_decoder.UnloadFrame();
      return 1;
    }
  }
  return ret ? 0 : 1;
}

// Decoder callback: converts `rows` rows of I420 planes to NV12 in the
// NV21Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI420ToNV12(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  // Use NV21 with VU swapped.
  I420ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I422 planes to NV12 in the
// NV21Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI422ToNV12(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  // Use NV21 with VU swapped.
  I422ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I444 planes to NV12 in the
// NV21Buffers passed as `opaque`, then advances the destination pointers.
static void JpegI444ToNV12(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  // Use NV21 with VU swapped.
  I444ToNV21(data[0], strides[0], data[2], strides[2], data[1], strides[1],
             dest->y, dest->y_stride, dest->vu, dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of a single I400 plane (data[0])
// into the NV21Buffers passed as `opaque`, then advances the pointers.
static void JpegI400ToNV12(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  NV21Buffers* dest = (NV21Buffers*)(opaque);
  // Use NV21 since there is no UV plane.
  I400ToNV21(data[0], strides[0], dest->y, dest->y_stride, dest->vu,
             dest->vu_stride, dest->w, rows);
  dest->y += rows * dest->y_stride;
  dest->vu += ((rows + 1) >> 1) * dest->vu_stride;
  dest->h -= rows;
}

// MJPG (Motion JPEG) to NV12.
// Returns 0 on success, -1 when sample_size is kUnknownDataSize, and 1 when
// the frame fails to load, its dimensions differ from src_width/src_height,
// its sampling layout is not one of the four handled below, or decoding
// fails.
LIBYUV_API
int MJPGToNV12(const uint8_t* sample,
               size_t sample_size,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int src_width,
               int src_height,
               int dst_width,
               int dst_height) {
  if (sample_size == kUnknownDataSize) {
    // ERROR: MJPEG frame size unknown
    return -1;
  }

  // TODO(fbarchard): Port MJpeg to C.
  MJpegDecoder mjpeg_decoder;
  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
              mjpeg_decoder.GetHeight() != src_height)) {
    // ERROR: MJPEG frame has unexpected dimensions
    mjpeg_decoder.UnloadFrame();
    return 1;  // runtime failure
  }
  if (ret) {
    // Use NV21Buffers but with UV instead of VU.
    NV21Buffers bufs = {dst_y,         dst_stride_y, dst_uv,
                        dst_stride_uv, dst_width,    dst_height};
    // The callback is chosen from the colour space, component count and
    // per-component sampling factors reported by the decoder.
    // YUV420
    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
        mjpeg_decoder.GetNumComponents() == 3 &&
        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToNV12, &bufs, dst_width,
                                           dst_height);
      // YUV422
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToNV12, &bufs, dst_width,
                                           dst_height);
      // YUV444
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToNV12, &bufs, dst_width,
                                           dst_height);
      // YUV400
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceGrayscale &&
               mjpeg_decoder.GetNumComponents() == 1 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToNV12, &bufs, dst_width,
                                           dst_height);
    } else {
      // Unknown colorspace.
      mjpeg_decoder.UnloadFrame();
      return 1;
    }
  }
  return ret ? 0 : 1;
}

// Destination state shared between MJPGToARGB and its decoder callbacks.
// The callbacks advance argb past the rows they wrote and subtract those
// rows from h.
struct ARGBBuffers {
  uint8_t* argb;
  int argb_stride;
  int w;  // Width passed to every conversion call.
  int h;  // Initialised to the destination height; decremented per callback.
};

// Decoder callback: converts `rows` rows of I420 planes to ARGB in the
// ARGBBuffers passed as `opaque`, then advances the destination pointer.
static void JpegI420ToARGB(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
  I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->argb, dest->argb_stride, dest->w, rows);
  dest->argb += rows * dest->argb_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I422 planes to ARGB in the
// ARGBBuffers passed as `opaque`, then advances the destination pointer.
static void JpegI422ToARGB(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
  I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->argb, dest->argb_stride, dest->w, rows);
  dest->argb += rows * dest->argb_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of I444 planes to ARGB in the
// ARGBBuffers passed as `opaque`, then advances the destination pointer.
static void JpegI444ToARGB(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
  I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
             dest->argb, dest->argb_stride, dest->w, rows);
  dest->argb += rows * dest->argb_stride;
  dest->h -= rows;
}

// Decoder callback: converts `rows` rows of a single I400 plane (data[0]) to
// ARGB in the ARGBBuffers passed as `opaque`, then advances the pointer.
static void JpegI400ToARGB(void* opaque,
                           const uint8_t* const* data,
                           const int* strides,
                           int rows) {
  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
  I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
  dest->argb += rows * dest->argb_stride;
  dest->h -= rows;
}

// MJPG (Motion JPeg) to ARGB
// TODO(fbarchard): review src_width and src_height requirement. dst_width and
// dst_height may be enough.
// Returns 0 on success, -1 when src_size_mjpg is kUnknownDataSize, and 1 when
// the frame fails to load, its dimensions differ from src_width/src_height,
// its sampling layout is not one of the four handled below, or decoding
// fails.
LIBYUV_API
int MJPGToARGB(const uint8_t* src_mjpg,
               size_t src_size_mjpg,
               uint8_t* dst_argb,
               int dst_stride_argb,
               int src_width,
               int src_height,
               int dst_width,
               int dst_height) {
  if (src_size_mjpg == kUnknownDataSize) {
    // ERROR: MJPEG frame size unknown
    return -1;
  }

  // TODO(fbarchard): Port MJpeg to C.
  MJpegDecoder mjpeg_decoder;
  LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(src_mjpg, src_size_mjpg);
  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
              mjpeg_decoder.GetHeight() != src_height)) {
    // ERROR: MJPEG frame has unexpected dimensions
    mjpeg_decoder.UnloadFrame();
    return 1;  // runtime failure
  }
  if (ret) {
    ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
    // The callback is chosen from the colour space, component count and
    // per-component sampling factors reported by the decoder.
    // YUV420
    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
        mjpeg_decoder.GetNumComponents() == 3 &&
        mjpeg_decoder.GetVertSampFactor(0) == 2 &&
        mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
        mjpeg_decoder.GetVertSampFactor(1) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
        mjpeg_decoder.GetVertSampFactor(2) == 1 &&
        mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width,
                                           dst_height);
      // YUV422
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width,
                                           dst_height);
      // YUV444
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceYCbCr &&
               mjpeg_decoder.GetNumComponents() == 3 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width,
                                           dst_height);
      // YUV400
    } else if (mjpeg_decoder.GetColorSpace() ==
                   MJpegDecoder::kColorSpaceGrayscale &&
               mjpeg_decoder.GetNumComponents() == 1 &&
               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
               mjpeg_decoder.GetHorizSampFactor(0) == 1) {
      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
                                           dst_height);
    } else {
      // TODO(fbarchard): Implement conversion for any other
      // colorspace/subsample factors that occur in practice. ERROR: Unable to
      // convert MJPEG frame because format is not supported
      mjpeg_decoder.UnloadFrame();
      return 1;
    }
  }
  return ret ? 0 : 1;
}

#endif  // HAVE_JPEG

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/source/convert_to_argb.cc000066400000000000000000000356451416500237200225270ustar00rootroot00000000000000/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/convert_argb.h"

#include "libyuv/cpu_id.h"
#ifdef HAVE_JPEG
#include "libyuv/mjpeg_decoder.h"
#endif
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/video_common.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Convert camera sample to ARGB with cropping, rotation and vertical flip.
// src_width is used for source stride computation
// src_height is used to compute location of planes, and indicate inversion
// sample_size is measured in bytes and is the size of the frame.
//   With MJPEG it is the compressed size of the frame.
// TODO(fbarchard): Add the following: // H010ToARGB // I010ToARGB LIBYUV_API int ConvertToARGB(const uint8_t* sample, size_t sample_size, uint8_t* dst_argb, int dst_stride_argb, int crop_x, int crop_y, int src_width, int src_height, int crop_width, int crop_height, enum RotationMode rotation, uint32_t fourcc) { uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; const uint8_t* src; const uint8_t* src_uv; int abs_src_height = (src_height < 0) ? -src_height : src_height; int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; // One pass rotation is available for some formats. For the rest, convert // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, // and then rotate the ARGB to the final destination buffer. // For in-place conversion, if destination dst_argb is same as source sample, // also enable temporary buffer. LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || dst_argb == sample; uint8_t* dest_argb = dst_argb; int dest_dst_stride_argb = dst_stride_argb; uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } if (src_height < 0) { inv_crop_height = -inv_crop_height; } if (need_buf) { int argb_size = crop_width * 4 * abs_crop_height; rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
} dst_argb = rotate_buffer; dst_stride_argb = crop_width * 4; } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_ARGB: if (!need_buf && !rotation) { src = sample + (src_width * crop_y + crop_x) * 4; r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); } break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_AR30: src = sample + (src_width * crop_y + crop_x) * 4; r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_AB30: src = sample + (src_width * crop_y + crop_x) * 4; r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, 
inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_J400: src = sample + src_width * crop_y + crop_x; r = J400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; // Call NV12 but with u and v parameters swapped. 
r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { const uint8_t* src_y = sample + (src_width * crop_y + crop_x); const uint8_t* src_u; const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J420: { int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; const uint8_t* src_y = sample + (src_width * crop_y + crop_x); const uint8_t* src_u = sample + src_width * abs_src_height + (halfwidth * crop_y + crop_x) / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_H420: { int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; const uint8_t* src_y = sample + (src_width * crop_y + crop_x); const uint8_t* src_u = sample + src_width * abs_src_height + (halfwidth * crop_y + crop_x) / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; r = H420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_U420: { int halfwidth = (src_width + 1) / 2; int halfheight = 
(abs_src_height + 1) / 2; const uint8_t* src_y = sample + (src_width * crop_y + crop_x); const uint8_t* src_u = sample + src_width * abs_src_height + (halfwidth * crop_y + crop_x) / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; r = U420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I422: case FOURCC_YV16: { int halfwidth = (src_width + 1) / 2; const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u; const uint8_t* src_v; if (format == FOURCC_YV16) { src_v = sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; src_u = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; src_v = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J422: { int halfwidth = (src_width + 1) / 2; const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u = sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + crop_x / 2; r = J422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_H422: { int halfwidth = (src_width + 1) / 2; const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u = sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + crop_x / 2; r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, 
dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_U422: { int halfwidth = (src_width + 1) / 2; const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u = sample + src_width * abs_src_height + halfwidth * crop_y + crop_x / 2; const uint8_t* src_v = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + crop_x / 2; r = H422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u; const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } else { src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J444: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u; const uint8_t* src_v; src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; r = J444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_H444: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u; const uint8_t* src_v; src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; r = H444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_U444: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* 
src_u; const uint8_t* src_v; src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; r = U444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, abs_src_height, crop_width, inv_crop_height); break; #endif default: r = -1; // unknown fourcc - return failure code. } if (need_buf) { if (!r) { r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, crop_width, abs_crop_height, rotation); } free(rotate_buffer); } else if (rotation) { src = sample + (src_width * crop_y + crop_x) * 4; r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, inv_crop_height, rotation); } return r; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/convert_to_i420.cc000066400000000000000000000263321416500237200222630ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include "libyuv/convert.h" #include "libyuv/video_common.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Convert camera sample to I420 with cropping, rotation and vertical flip. // src_width is used for source stride computation // src_height is used to compute location of planes, and indicate inversion // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. 
LIBYUV_API int ConvertToI420(const uint8_t* sample, size_t sample_size, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int crop_x, int crop_y, int src_width, int src_height, int crop_width, int crop_height, enum RotationMode rotation, uint32_t fourcc) { uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; const uint8_t* src; const uint8_t* src_uv; const int abs_src_height = (src_height < 0) ? -src_height : src_height; // TODO(nisse): Why allow crop_height < 0? const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && format != FOURCC_NV21 && format != FOURCC_YV12) || dst_y == sample; uint8_t* tmp_y = dst_y; uint8_t* tmp_u = dst_u; uint8_t* tmp_v = dst_v; int tmp_y_stride = dst_stride_y; int tmp_u_stride = dst_stride_u; int tmp_v_stride = dst_stride_v; uint8_t* rotate_buffer = NULL; const int inv_crop_height = (src_height < 0) ? -abs_crop_height : abs_crop_height; if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. // For in-place conversion, if destination dst_y is same as source sample, // also enable temporary buffer. if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
} dst_y = rotate_buffer; dst_u = dst_y + y_size; dst_v = dst_u + uv_size; dst_stride_y = crop_width; dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); } switch (format) { // Single plane formats case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix. uint8_t* u = (crop_x & 1) ? dst_v : dst_u; uint8_t* v = (crop_x & 1) ? dst_u : dst_v; int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v; src = sample + (aligned_src_width * crop_y + crop_x) * 2; r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, stride_u, v, stride_v, crop_width, inv_crop_height); break; } case FOURCC_UYVY: { uint8_t* u = (crop_x & 1) ? dst_v : dst_u; uint8_t* v = (crop_x & 1) ? dst_u : dst_v; int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v; src = sample + (aligned_src_width * crop_y + crop_x) * 2; r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, stride_u, v, stride_v, crop_width, inv_crop_height); break; } case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; r = RAWToI420(src, src_width * 3, dst_y, 
dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_ARGB: src = sample + (src_width * crop_y + crop_x) * 4; r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; // TODO(fbarchard): Add AR30 and AB30 case FOURCC_I400: src = sample + src_width * crop_y + crop_x; r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); src_uv = sample + (src_width * abs_src_height) + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height, rotation); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); src_uv = sample + (src_width * abs_src_height) + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); // Call NV12 but with dst_u and dst_v parameters swapped. 
r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, crop_width, inv_crop_height, rotation); break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { const uint8_t* src_y = sample + (src_width * crop_y + crop_x); const uint8_t* src_u; const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + (crop_x / 2); src_u = sample + src_width * abs_src_height + halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); } else { src_u = sample + src_width * abs_src_height + halfwidth * (crop_y / 2) + (crop_x / 2); src_v = sample + src_width * abs_src_height + halfwidth * (halfheight + (crop_y / 2)) + (crop_x / 2); } r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height, rotation); break; } case FOURCC_I422: case FOURCC_YV16: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u; const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { src_v = sample + src_width * abs_src_height + halfwidth * crop_y + (crop_x / 2); src_u = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + (crop_x / 2); } else { src_u = sample + src_width * abs_src_height + halfwidth * crop_y + (crop_x / 2); src_v = sample + src_width * abs_src_height + halfwidth * (abs_src_height + crop_y) + (crop_x / 2); } r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { const uint8_t* src_y = sample + src_width * crop_y + crop_x; const uint8_t* src_u; const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) 
+ crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } else { src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, src_width, abs_src_height, crop_width, inv_crop_height); break; #endif default: r = -1; // unknown fourcc - return failure code. } if (need_buf) { if (!r) { r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, tmp_v, tmp_v_stride, crop_width, abs_crop_height, rotation); } free(rotate_buffer); } return r; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/cpu_id.cc000066400000000000000000000221431416500237200206020ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/cpu_id.h" #if defined(_MSC_VER) #include // For __cpuidex() #endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) #include // For _xgetbv() #endif // For ArmCpuCaps() but unittested on all platforms #include #include #ifdef __cplusplus namespace libyuv { extern "C" { #endif // For functions that use the stack and have runtime checks for overflow, // use SAFEBUFFERS to avoid additional check. #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) && \ !defined(__clang__) #define SAFEBUFFERS __declspec(safebuffers) #else #define SAFEBUFFERS #endif // cpu_info_ variable for SIMD instruction sets detected. LIBYUV_API int cpu_info_ = 0; // TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API void CpuId(int info_eax, int info_ecx, int* cpu_info) { #if defined(_MSC_VER) // Visual C version uses intrinsic or inline x86 assembly. #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) __cpuidex(cpu_info, info_eax, info_ecx); #elif defined(_M_IX86) __asm { mov eax, info_eax mov ecx, info_ecx mov edi, cpu_info cpuid mov [edi], eax mov [edi + 4], ebx mov [edi + 8], ecx mov [edi + 12], edx } #else // Visual C but not x86 if (info_ecx == 0) { __cpuid(cpu_info, info_eax); } else { cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; } #endif // GCC version uses inline x86 assembly. #else // defined(_MSC_VER) int info_ebx, info_edx; asm volatile( #if defined(__i386__) && defined(__PIC__) // Preserve ebx for fpic 32 bit. 
"mov %%ebx, %%edi \n" "cpuid \n" "xchg %%edi, %%ebx \n" : "=D"(info_ebx), #else "cpuid \n" : "=b"(info_ebx), #endif // defined( __i386__) && defined(__PIC__) "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); cpu_info[0] = info_eax; cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; cpu_info[3] = info_edx; #endif // defined(_MSC_VER) } #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API void CpuId(int eax, int ecx, int* cpu_info) { (void)eax; (void)ecx; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } #endif // For VS2010 and earlier emit can be used: // _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. // __asm { // xor ecx, ecx // xcr 0 // xgetbv // mov xcr0, eax // } // For VS2013 and earlier 32 bit, the _xgetbv(0) optimizer produces bad code. // https://code.google.com/p/libyuv/issues/detail?id=529 #if defined(_M_IX86) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif #if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int GetXCR0() { int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT #elif defined(__i386__) || defined(__x86_64__) asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) return xcr0; } #else // xgetbv unavailable to query for OSSave support. Return 0. #define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. 
#if defined(_M_IX86) && (_MSC_VER < 1900) #pragma optimize("g", on) #endif // Based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { // Assume Neon if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. return kCpuHasNEON; } while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { if (memcmp(cpuinfo_line, "Features", 8) == 0) { char* p = strstr(cpuinfo_line, " neon"); if (p && (p[5] == ' ' || p[5] == '\n')) { fclose(f); return kCpuHasNEON; } // aarch64 uses asimd for Neon. p = strstr(cpuinfo_line, " asimd"); if (p) { fclose(f); return kCpuHasNEON; } } } fclose(f); return 0; } // TODO(fbarchard): Consider read_msa_ir(). LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; int flag = 0x0; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { // Assume nothing if /proc/cpuinfo is unavailable. // This will occur for Chrome sandbox for Pepper or Render process. return 0; } while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { if (memcmp(cpuinfo_line, "cpu model", 9) == 0) { // Workaround early kernel without mmi in ASEs line. if (strstr(cpuinfo_line, "Loongson-3")) { flag |= kCpuHasMMI; } else if (strstr(cpuinfo_line, "Loongson-2K")) { flag |= kCpuHasMMI | kCpuHasMSA; } } if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { if (strstr(cpuinfo_line, "loongson-mmi") && strstr(cpuinfo_line, "loongson-ext")) { flag |= kCpuHasMMI; } if (strstr(cpuinfo_line, "msa")) { flag |= kCpuHasMSA; } // ASEs is the last line, so we can break here. 
break; } } fclose(f); return flag; } static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info = 0; #if !defined(__pnacl__) && !defined(__CLR_VER) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ defined(_M_IX86)) int cpu_info0[4] = {0, 0, 0, 0}; int cpu_info1[4] = {0, 0, 0, 0}; int cpu_info7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); } cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); // AVX requires OS saves YMM registers. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; } } #endif #if defined(__mips__) && defined(__linux__) cpu_info = MipsCpuCaps("/proc/cpuinfo"); cpu_info |= kCpuHasMIPS; #endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ // __ARM_NEON__ generates code that requires Neon. NaCL also requires Neon. // For Linux, /proc/cpuinfo can be tested but without that assume Neon. 
#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__) cpu_info = kCpuHasNEON; // For aarch64(arm64), /proc/cpuinfo's feature is not complete, e.g. no neon // flag in it. // So for aarch64, neon enabling is hard coded here. #endif #if defined(__aarch64__) cpu_info = kCpuHasNEON; #else // Linux arm parse text file for neon detect. cpu_info = ArmCpuCaps("/proc/cpuinfo"); #endif cpu_info |= kCpuHasARM; #endif // __arm__ cpu_info |= kCpuInitialized; return cpu_info; } // Note that use of this function is not thread safe. LIBYUV_API int MaskCpuFlags(int enable_flags) { int cpu_info = GetCpuFlags() & enable_flags; SetCpuFlags(cpu_info); return cpu_info; } LIBYUV_API int InitCpuFlags(void) { return MaskCpuFlags(-1); } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/mjpeg_decoder.cc000066400000000000000000000452321416500237200221320ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/mjpeg_decoder.h" #ifdef HAVE_JPEG #include #if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) // Must be included before jpeglib. #include #define HAVE_SETJMP #if defined(_MSC_VER) // disable warning 4324: structure was padded due to __declspec(align()) #pragma warning(disable : 4324) #endif #endif #include // For jpeglib.h. // C++ build requires extern C for jpeg internals. #ifdef __cplusplus extern "C" { #endif #include #ifdef __cplusplus } // extern "C" #endif #include "libyuv/planar_functions.h" // For CopyPlane(). 
namespace libyuv { #ifdef HAVE_SETJMP struct SetJmpErrorMgr { jpeg_error_mgr base; // Must be at the top jmp_buf setjmp_buffer; }; #endif const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN; const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE; const int MJpegDecoder::kColorSpaceRgb = JCS_RGB; const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr; const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK; const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK; // Methods that are passed to jpeglib. boolean fill_input_buffer(jpeg_decompress_struct* cinfo); void init_source(jpeg_decompress_struct* cinfo); void skip_input_data(jpeg_decompress_struct* cinfo, long num_bytes); // NOLINT void term_source(jpeg_decompress_struct* cinfo); void ErrorHandler(jpeg_common_struct* cinfo); void OutputHandler(jpeg_common_struct* cinfo); MJpegDecoder::MJpegDecoder() : has_scanline_padding_(LIBYUV_FALSE), num_outbufs_(0), scanlines_(NULL), scanlines_sizes_(NULL), databuf_(NULL), databuf_strides_(NULL) { decompress_struct_ = new jpeg_decompress_struct; source_mgr_ = new jpeg_source_mgr; #ifdef HAVE_SETJMP error_mgr_ = new SetJmpErrorMgr; decompress_struct_->err = jpeg_std_error(&error_mgr_->base); // Override standard exit()-based error handler. 
error_mgr_->base.error_exit = &ErrorHandler; error_mgr_->base.output_message = &OutputHandler; #endif decompress_struct_->client_data = NULL; source_mgr_->init_source = &init_source; source_mgr_->fill_input_buffer = &fill_input_buffer; source_mgr_->skip_input_data = &skip_input_data; source_mgr_->resync_to_restart = &jpeg_resync_to_restart; source_mgr_->term_source = &term_source; jpeg_create_decompress(decompress_struct_); decompress_struct_->src = source_mgr_; buf_vec_.buffers = &buf_; buf_vec_.len = 1; } MJpegDecoder::~MJpegDecoder() { jpeg_destroy_decompress(decompress_struct_); delete decompress_struct_; delete source_mgr_; #ifdef HAVE_SETJMP delete error_mgr_; #endif DestroyOutputBuffers(); } LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { if (!ValidateJpeg(src, src_len)) { return LIBYUV_FALSE; } buf_.data = src; buf_.len = static_cast(src_len); buf_vec_.pos = 0; decompress_struct_->client_data = &buf_vec_; #ifdef HAVE_SETJMP if (setjmp(error_mgr_->setjmp_buffer)) { // We called jpeg_read_header, it experienced an error, and we called // longjmp() and rewound the stack to here. Return error. return LIBYUV_FALSE; } #endif if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) { // ERROR: Bad MJPEG header return LIBYUV_FALSE; } AllocOutputBuffers(GetNumComponents()); for (int i = 0; i < num_outbufs_; ++i) { int scanlines_size = GetComponentScanlinesPerImcuRow(i); if (scanlines_sizes_[i] != scanlines_size) { if (scanlines_[i]) { delete scanlines_[i]; } scanlines_[i] = new uint8_t*[scanlines_size]; scanlines_sizes_[i] = scanlines_size; } // We allocate padding for the final scanline to pad it up to DCTSIZE bytes // to avoid memory errors, since jpeglib only reads full MCUs blocks. For // the preceding scanlines, the padding is not needed/wanted because the // following addresses will already be valid (they are the initial bytes of // the next scanline) and will be overwritten when jpeglib writes out that // next scanline. 
int databuf_stride = GetComponentStride(i); int databuf_size = scanlines_size * databuf_stride; if (databuf_strides_[i] != databuf_stride) { if (databuf_[i]) { delete databuf_[i]; } databuf_[i] = new uint8_t[databuf_size]; databuf_strides_[i] = databuf_stride; } if (GetComponentStride(i) != GetComponentWidth(i)) { has_scanline_padding_ = LIBYUV_TRUE; } } return LIBYUV_TRUE; } static int DivideAndRoundUp(int numerator, int denominator) { return (numerator + denominator - 1) / denominator; } static int DivideAndRoundDown(int numerator, int denominator) { return numerator / denominator; } // Returns width of the last loaded frame. int MJpegDecoder::GetWidth() { return decompress_struct_->image_width; } // Returns height of the last loaded frame. int MJpegDecoder::GetHeight() { return decompress_struct_->image_height; } // Returns format of the last loaded frame. The return value is one of the // kColorSpace* constants. int MJpegDecoder::GetColorSpace() { return decompress_struct_->jpeg_color_space; } // Number of color components in the color space. int MJpegDecoder::GetNumComponents() { return decompress_struct_->num_components; } // Sample factors of the n-th component. 
int MJpegDecoder::GetHorizSampFactor(int component) { return decompress_struct_->comp_info[component].h_samp_factor; } int MJpegDecoder::GetVertSampFactor(int component) { return decompress_struct_->comp_info[component].v_samp_factor; } int MJpegDecoder::GetHorizSubSampFactor(int component) { return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); } int MJpegDecoder::GetVertSubSampFactor(int component) { return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); } int MJpegDecoder::GetImageScanlinesPerImcuRow() { return decompress_struct_->max_v_samp_factor * DCTSIZE; } int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) { int vs = GetVertSubSampFactor(component); return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs); } int MJpegDecoder::GetComponentWidth(int component) { int hs = GetHorizSubSampFactor(component); return DivideAndRoundUp(GetWidth(), hs); } int MJpegDecoder::GetComponentHeight(int component) { int vs = GetVertSubSampFactor(component); return DivideAndRoundUp(GetHeight(), vs); } // Get width in bytes padded out to a multiple of DCTSIZE int MJpegDecoder::GetComponentStride(int component) { return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1); } int MJpegDecoder::GetComponentSize(int component) { return GetComponentWidth(component) * GetComponentHeight(component); } LIBYUV_BOOL MJpegDecoder::UnloadFrame() { #ifdef HAVE_SETJMP if (setjmp(error_mgr_->setjmp_buffer)) { // We called jpeg_abort_decompress, it experienced an error, and we called // longjmp() and rewound the stack to here. Return error. return LIBYUV_FALSE; } #endif jpeg_abort_decompress(decompress_struct_); return LIBYUV_TRUE; } // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. 
LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height) { if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } #ifdef HAVE_SETJMP if (setjmp(error_mgr_->setjmp_buffer)) { // We called into jpeglib, it experienced an error sometime during this // function call, and we called longjmp() and rewound the stack to here. // Return error. return LIBYUV_FALSE; } #endif if (!StartDecode()) { return LIBYUV_FALSE; } SetScanlinePointers(databuf_); int lines_left = dst_height; // Compute amount of lines to skip to implement vertical crop. // TODO(fbarchard): Ensure skip is a multiple of maximum component // subsample. ie 2 int skip = (GetHeight() - dst_height) / 2; if (skip > 0) { // There is no API to skip lines in the output data, so we read them // into the temp buffer. while (skip >= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } skip -= GetImageScanlinesPerImcuRow(); } if (skip > 0) { // Have a partial iMCU row left over to skip. Must read it and then // copy the parts we want into the destination. 
if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { // TODO(fbarchard): Compute skip to avoid this assert(skip % GetVertSubSampFactor(i) == 0); int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - rows_to_skip; int data_to_skip = rows_to_skip * GetComponentStride(i); CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } lines_left -= (GetImageScanlinesPerImcuRow() - skip); } } // Read full MCUs but cropped horizontally for (; lines_left > GetImageScanlinesPerImcuRow(); lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); CopyPlane(databuf_[i], GetComponentStride(i), planes[i], GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } if (lines_left > 0) { // Have a partial iMCU row left over to decode. 
if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); CopyPlane(databuf_[i], GetComponentStride(i), planes[i], GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } return FinishDecode(); } LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, int dst_width, int dst_height) { if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } #ifdef HAVE_SETJMP if (setjmp(error_mgr_->setjmp_buffer)) { // We called into jpeglib, it experienced an error sometime during this // function call, and we called longjmp() and rewound the stack to here. // Return error. return LIBYUV_FALSE; } #endif if (!StartDecode()) { return LIBYUV_FALSE; } SetScanlinePointers(databuf_); int lines_left = dst_height; // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop int skip = (GetHeight() - dst_height) / 2; if (skip > 0) { while (skip >= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } skip -= GetImageScanlinesPerImcuRow(); } if (skip > 0) { // Have a partial iMCU row left over to skip. if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { // TODO(fbarchard): Compute skip to avoid this assert(skip % GetVertSubSampFactor(i) == 0); int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); int data_to_skip = rows_to_skip * GetComponentStride(i); // Change our own data buffer pointers so we can pass them to the // callback. databuf_[i] += data_to_skip; } int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip; (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy); // Now change them back. 
for (int i = 0; i < num_outbufs_; ++i) { int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); int data_to_skip = rows_to_skip * GetComponentStride(i); databuf_[i] -= data_to_skip; } lines_left -= scanlines_to_copy; } } // Read full MCUs until we get to the crop point. for (; lines_left >= GetImageScanlinesPerImcuRow(); lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow()); } if (lines_left > 0) { // Have a partial iMCU row left over to decode. if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } (*fn)(opaque, databuf_, databuf_strides_, lines_left); } return FinishDecode(); } void init_source(j_decompress_ptr cinfo) { fill_input_buffer(cinfo); } boolean fill_input_buffer(j_decompress_ptr cinfo) { BufferVector* buf_vec = reinterpret_cast(cinfo->client_data); if (buf_vec->pos >= buf_vec->len) { // ERROR: No more data return FALSE; } cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data; cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len; ++buf_vec->pos; return TRUE; } void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT jpeg_source_mgr* src = cinfo->src; size_t bytes = static_cast(num_bytes); if (bytes > src->bytes_in_buffer) { src->next_input_byte = nullptr; src->bytes_in_buffer = 0; } else { src->next_input_byte += bytes; src->bytes_in_buffer -= bytes; } } void term_source(j_decompress_ptr cinfo) { (void)cinfo; // Nothing to do. } #ifdef HAVE_SETJMP void ErrorHandler(j_common_ptr cinfo) { // This is called when a jpeglib command experiences an error. Unfortunately // jpeglib's error handling model is not very flexible, because it expects the // error handler to not return--i.e., it wants the program to terminate. To // recover from errors we use setjmp() as shown in their example. 
setjmp() is // C's implementation for the "call with current continuation" functionality // seen in some functional programming languages. // A formatted message can be output, but is unsafe for release. #ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); // ERROR: Error in jpeglib: buf #endif SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); // This rewinds the call stack to the point of the corresponding setjmp() // and causes it to return (for a second time) with value 1. longjmp(mgr->setjmp_buffer, 1); } // Suppress fprintf warnings. void OutputHandler(j_common_ptr cinfo) { (void)cinfo; } #endif // HAVE_SETJMP void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { if (num_outbufs != num_outbufs_) { // We could perhaps optimize this case to resize the output buffers without // necessarily having to delete and recreate each one, but it's not worth // it. DestroyOutputBuffers(); scanlines_ = new uint8_t**[num_outbufs]; scanlines_sizes_ = new int[num_outbufs]; databuf_ = new uint8_t*[num_outbufs]; databuf_strides_ = new int[num_outbufs]; for (int i = 0; i < num_outbufs; ++i) { scanlines_[i] = NULL; scanlines_sizes_[i] = 0; databuf_[i] = NULL; databuf_strides_[i] = 0; } num_outbufs_ = num_outbufs; } } void MJpegDecoder::DestroyOutputBuffers() { for (int i = 0; i < num_outbufs_; ++i) { delete[] scanlines_[i]; delete[] databuf_[i]; } delete[] scanlines_; delete[] databuf_; delete[] scanlines_sizes_; delete[] databuf_strides_; scanlines_ = NULL; databuf_ = NULL; scanlines_sizes_ = NULL; databuf_strides_ = NULL; num_outbufs_ = 0; } // JDCT_IFAST and do_block_smoothing improve performance substantially. 
LIBYUV_BOOL MJpegDecoder::StartDecode() { decompress_struct_->raw_data_out = TRUE; decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default decompress_struct_->dither_mode = JDITHER_NONE; // Not applicable to 'raw': decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE); // Only for buffered mode: decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE); // Blocky but fast: decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE); if (!jpeg_start_decompress(decompress_struct_)) { // ERROR: Couldn't start JPEG decompressor"; return LIBYUV_FALSE; } return LIBYUV_TRUE; } LIBYUV_BOOL MJpegDecoder::FinishDecode() { // jpeglib considers it an error if we finish without decoding the whole // image, so we call "abort" rather than "finish". jpeg_abort_decompress(decompress_struct_); return LIBYUV_TRUE; } void MJpegDecoder::SetScanlinePointers(uint8_t** data) { for (int i = 0; i < num_outbufs_; ++i) { uint8_t* data_i = data[i]; for (int j = 0; j < scanlines_sizes_[i]; ++j) { scanlines_[i][j] = data_i; data_i += GetComponentStride(i); } } } inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { return (unsigned int)(GetImageScanlinesPerImcuRow()) == jpeg_read_raw_data(decompress_struct_, scanlines_, GetImageScanlinesPerImcuRow()); } // The helper function which recognizes the jpeg sub-sampling type. JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( int* subsample_x, int* subsample_y, int number_of_components) { if (number_of_components == 3) { // Color images. 
if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { return kJpegYuv420; } if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { return kJpegYuv422; } if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { return kJpegYuv444; } } else if (number_of_components == 1) { // Grey-scale images. if (subsample_x[0] == 1 && subsample_y[0] == 1) { return kJpegYuv400; } } return kJpegUnknown; } } // namespace libyuv #endif // HAVE_JPEG libyuv-0.0~git20220104.b91df1a/source/mjpeg_validate.cc000066400000000000000000000043741416500237200223200ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/mjpeg_decoder.h" #include // For memchr. #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Helper function to scan for EOI marker (0xff 0xd9). static LIBYUV_BOOL ScanEOI(const uint8_t* src_mjpg, size_t src_size_mjpg) { if (src_size_mjpg >= 2) { const uint8_t* end = src_mjpg + src_size_mjpg - 1; const uint8_t* it = src_mjpg; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. it = (const uint8_t*)(memchr(it, 0xff, end - it)); if (it == NULL) { break; } if (it[1] == 0xd9) { return LIBYUV_TRUE; // Success: Valid jpeg. } ++it; // Skip over current 0xff. } } // ERROR: Invalid jpeg end code not found. Size src_size_mjpg return LIBYUV_FALSE; } // Helper function to validate the jpeg appears intact. 
LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) { // Maximum size that ValidateJpeg will consider valid. const size_t kMaxJpegSize = 0x7fffffffull; const size_t kBackSearchSize = 1024; if (src_size_mjpg < 64 || src_size_mjpg > kMaxJpegSize || !src_mjpg) { // ERROR: Invalid jpeg size: src_size_mjpg return LIBYUV_FALSE; } // SOI marker if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) { // ERROR: Invalid jpeg initial start code return LIBYUV_FALSE; } // Look for the End Of Image (EOI) marker near the end of the buffer. if (src_size_mjpg > kBackSearchSize) { if (ScanEOI(src_mjpg + src_size_mjpg - kBackSearchSize, kBackSearchSize)) { return LIBYUV_TRUE; // Success: Valid jpeg. } // Reduce search size for forward search. src_size_mjpg = src_size_mjpg - kBackSearchSize + 1; } // Step over SOI marker and scan for EOI. return ScanEOI(src_mjpg + 2, src_size_mjpg - 2); } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/planar_functions.cc000066400000000000000000004423761416500237200227220ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/planar_functions.h" #include #include // for memset() #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif #include "libyuv/row.h" #include "libyuv/scale_row.h" // for ScaleRowDown2 #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Copy a plane of data LIBYUV_API void CopyPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } // Nothing to do. if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) if (TestCpuFlag(kCpuHasAVX)) { CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; } #endif #if defined(HAS_COPYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif // Copy plane for (y = 0; y < height; ++y) { CopyRow(src_y, dst_y, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // TODO(fbarchard): Consider support for negative height. // TODO(fbarchard): Consider stride measured in bytes. LIBYUV_API void CopyPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height) { int y; void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; // Coalesce rows. 
if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } #if defined(HAS_COPYROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_16_SSE2; } #endif #if defined(HAS_COPYROW_16_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_16_ERMS; } #endif #if defined(HAS_COPYROW_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { CopyRow = CopyRow_16_NEON; } #endif // Copy plane for (y = 0; y < height; ++y) { CopyRow(src_y, dst_y, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // Convert a plane of 16 bit data to 8 bit LIBYUV_API void Convert16To8Plane(const uint16_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int scale, // 16384 for 10 bits int width, int height) { int y; void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) = Convert16To8Row_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. 
if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } #if defined(HAS_CONVERT16TO8ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { Convert16To8Row = Convert16To8Row_Any_SSSE3; if (IS_ALIGNED(width, 16)) { Convert16To8Row = Convert16To8Row_SSSE3; } } #endif #if defined(HAS_CONVERT16TO8ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { Convert16To8Row = Convert16To8Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { Convert16To8Row = Convert16To8Row_AVX2; } } #endif // Convert plane for (y = 0; y < height; ++y) { Convert16To8Row(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // Convert a plane of 8 bit data to 16 bit LIBYUV_API void Convert8To16Plane(const uint8_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int scale, // 16384 for 10 bits int width, int height) { int y; void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) = Convert8To16Row_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } #if defined(HAS_CONVERT8TO16ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { Convert8To16Row = Convert8To16Row_Any_SSE2; if (IS_ALIGNED(width, 16)) { Convert8To16Row = Convert8To16Row_SSE2; } } #endif #if defined(HAS_CONVERT8TO16ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { Convert8To16Row = Convert8To16Row_Any_AVX2; if (IS_ALIGNED(width, 32)) { Convert8To16Row = Convert8To16Row_AVX2; } } #endif // Convert plane for (y = 0; y < height; ++y) { Convert8To16Row(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // Copy I422. 
// Copies the three planes of an I422 image (chroma is half width, full
// height). |dst_y| may be NULL to skip the luma plane. A negative |height|
// reads the source bottom-up, which flips the image vertically.
// Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I422Copy(const uint8_t* src_y,
             int src_stride_y,
             const uint8_t* src_u,
             int src_stride_u,
             const uint8_t* src_v,
             int src_stride_v,
             uint8_t* dst_y,
             int dst_stride_y,
             uint8_t* dst_u,
             int dst_stride_u,
             uint8_t* dst_v,
             int dst_stride_v,
             int width,
             int height) {
  int halfwidth = (width + 1) >> 1;
  // A luma source is only required when a luma destination was supplied.
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (height - 1) * src_stride_u;
    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  if (dst_y) {
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }
  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
  return 0;
}

// Copy I444.
// All three planes are full size. |dst_y| may be NULL to skip the luma plane.
// A negative |height| reads the source bottom-up, which flips the image
// vertically. Returns 0 on success, -1 on invalid arguments.
LIBYUV_API
int I444Copy(const uint8_t* src_y,
             int src_stride_y,
             const uint8_t* src_u,
             int src_stride_u,
             const uint8_t* src_v,
             int src_stride_v,
             uint8_t* dst_y,
             int dst_stride_y,
             uint8_t* dst_u,
             int dst_stride_u,
             uint8_t* dst_v,
             int dst_stride_v,
             int width,
             int height) {
  // A luma source is only required when a luma destination was supplied.
  if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v ||
      width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (height - 1) * src_stride_u;
    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  if (dst_y) {
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }
  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
  return 0;
}

// Copy I400.
LIBYUV_API int I400ToI400(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; } // Convert I420 to I400. LIBYUV_API int I420ToI400(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, int width, int height) { (void)src_u; (void)src_stride_u; (void)src_v; (void)src_stride_v; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; } // Copy NV12. Supports inverting. int NV12Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { if (!src_y || !dst_y || !src_uv || !dst_uv || width <= 0 || height == 0) { return -1; } int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_uv = src_uv + (halfheight - 1) * src_stride_uv; src_stride_y = -src_stride_y; src_stride_uv = -src_stride_uv; } CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth * 2, halfheight); return 0; } // Copy NV21. Supports inverting. 
int NV21Copy(const uint8_t* src_y, int src_stride_y, const uint8_t* src_vu, int src_stride_vu, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { return NV12Copy(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // Support function for NV12 etc UV channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API void SplitUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_u = dst_u + (height - 1) * dst_stride_u; dst_v = dst_v + (height - 1) * dst_stride_v; dst_stride_u = -dst_stride_u; dst_stride_v = -dst_stride_v; } // Coalesce rows. if (src_stride_uv == width * 2 && dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; src_stride_uv = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_SPLITUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { SplitUVRow = SplitUVRow_SSE2; } } #endif #if defined(HAS_SPLITUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitUVRow = SplitUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { SplitUVRow = SplitUVRow_AVX2; } } #endif #if defined(HAS_SPLITUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitUVRow = SplitUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SplitUVRow = SplitUVRow_NEON; } } #endif #if defined(HAS_SPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitUVRow = SplitUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { SplitUVRow = SplitUVRow_MMI; } } #endif #if defined(HAS_SPLITUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SplitUVRow = SplitUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { SplitUVRow = SplitUVRow_MSA; } } #endif for (y = 0; y < height; ++y) { // Copy a row of UV. 
SplitUVRow(src_uv, dst_u, dst_v, width); dst_u += dst_stride_u; dst_v += dst_stride_v; src_uv += src_stride_uv; } } LIBYUV_API void MergeUVPlane(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_uv = dst_uv + (height - 1) * dst_stride_uv; dst_stride_uv = -dst_stride_uv; } // Coalesce rows. if (src_stride_u == width && src_stride_v == width && dst_stride_uv == width * 2) { width *= height; height = 1; src_stride_u = src_stride_v = dst_stride_uv = 0; } #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow = MergeUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_SSE2; } } #endif #if defined(HAS_MERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow = MergeUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { MergeUVRow = MergeUVRow_AVX2; } } #endif #if defined(HAS_MERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow = MergeUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_NEON; } } #endif #if defined(HAS_MERGEUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeUVRow = MergeUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { MergeUVRow = MergeUVRow_MMI; } } #endif #if defined(HAS_MERGEUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MergeUVRow = MergeUVRow_Any_MSA; if (IS_ALIGNED(width, 16)) { MergeUVRow = MergeUVRow_MSA; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. MergeUVRow(src_u, src_v, dst_uv, width); src_u += src_stride_u; src_v += src_stride_v; dst_uv += dst_stride_uv; } } // Support function for P010 etc UV channels. // Width and height are plane sizes (typically half pixel width). 
LIBYUV_API void SplitUVPlane_16(const uint16_t* src_uv, int src_stride_uv, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int width, int height, int depth) { int y; void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width) = SplitUVRow_16_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_u = dst_u + (height - 1) * dst_stride_u; dst_v = dst_v + (height - 1) * dst_stride_v; dst_stride_u = -dst_stride_u; dst_stride_v = -dst_stride_v; } // Coalesce rows. if (src_stride_uv == width * 2 && dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; src_stride_uv = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_SPLITUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitUVRow_16 = SplitUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { SplitUVRow_16 = SplitUVRow_16_AVX2; } } #endif #if defined(HAS_SPLITUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitUVRow_16 = SplitUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { SplitUVRow_16 = SplitUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Copy a row of UV. SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); dst_u += dst_stride_u; dst_v += dst_stride_v; src_uv += src_stride_uv; } } LIBYUV_API void MergeUVPlane_16(const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, uint16_t* dst_uv, int dst_stride_uv, int width, int height, int depth) { int y; void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width) = MergeUVRow_16_C; assert(depth >= 8); assert(depth <= 16); // Negative height means invert the image. if (height < 0) { height = -height; dst_uv = dst_uv + (height - 1) * dst_stride_uv; dst_stride_uv = -dst_stride_uv; } // Coalesce rows. 
if (src_stride_u == width && src_stride_v == width && dst_stride_uv == width * 2) { width *= height; height = 1; src_stride_u = src_stride_v = dst_stride_uv = 0; } #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeUVRow_16 = MergeUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeUVRow_16 = MergeUVRow_16_AVX2; } } #endif #if defined(HAS_MERGEUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeUVRow_16 = MergeUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeUVRow_16 = MergeUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. MergeUVRow_16(src_u, src_v, dst_uv, depth, width); src_u += src_stride_u; src_v += src_stride_v; dst_uv += dst_stride_uv; } } // Convert plane from lsb to msb LIBYUV_API void ConvertToMSBPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height, int depth) { int y; int scale = 1 << (16 - depth); void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) = MultiplyRow_16_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. 
if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } #if defined(HAS_MULTIPLYROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MultiplyRow_16 = MultiplyRow_16_Any_AVX2; if (IS_ALIGNED(width, 32)) { MultiplyRow_16 = MultiplyRow_16_AVX2; } } #endif #if defined(HAS_MULTIPLYROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { MultiplyRow_16 = MultiplyRow_16_Any_NEON; if (IS_ALIGNED(width, 16)) { MultiplyRow_16 = MultiplyRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { MultiplyRow_16(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // Convert plane from msb to lsb LIBYUV_API void ConvertToLSBPlane_16(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, int width, int height, int depth) { int y; int scale = 1 << depth; void (*DivideRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) = DivideRow_16_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } #if defined(HAS_DIVIDEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { DivideRow = DivideRow_16_Any_AVX2; if (IS_ALIGNED(width, 32)) { DivideRow = DivideRow_16_AVX2; } } #endif #if defined(HAS_DIVIDEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { DivideRow = DivideRow_16_Any_NEON; if (IS_ALIGNED(width, 16)) { DivideRow = DivideRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { DivideRow(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // Swap U and V channels in interleaved UV plane. 
LIBYUV_API
void SwapUVPlane(const uint8_t* src_uv,
                 int src_stride_uv,
                 uint8_t* dst_vu,
                 int dst_stride_vu,
                 int width,
                 int height) {
  // width is in UV pairs: a fully packed row has a stride of width * 2 bytes.
  // A negative height reads the source bottom-up, producing a flipped image.
  int y;
  void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
      SwapUVRow_C;
  // Empty plane: nothing to swap. Without this check a zero height with
  // contiguous strides would coalesce to a single row of width 0 and still
  // invoke the row function.
  if (width <= 0 || height == 0) {
    return;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_uv = src_uv + (height - 1) * src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }
  // Coalesce rows.
  if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
    width *= height;
    height = 1;
    src_stride_uv = dst_stride_vu = 0;
  }

  // Later blocks override earlier ones, so the last enabled CPU feature wins.
#if defined(HAS_SWAPUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    SwapUVRow = SwapUVRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      SwapUVRow = SwapUVRow_SSSE3;
    }
  }
#endif
#if defined(HAS_SWAPUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    SwapUVRow = SwapUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      SwapUVRow = SwapUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_SWAPUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SwapUVRow = SwapUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      SwapUVRow = SwapUVRow_NEON;
    }
  }
#endif

  for (y = 0; y < height; ++y) {
    SwapUVRow(src_uv, dst_vu, width);
    src_uv += src_stride_uv;
    dst_vu += dst_stride_vu;
  }
}

// Convert NV21 to NV12.
// The Y plane is copied only when dst_y is non-NULL; the VU plane is always
// converted with SwapUVPlane at half resolution (rounded up).
// Returns 0 on success, -1 when src_vu or dst_uv is NULL, width <= 0 or
// height == 0.
LIBYUV_API
int NV21ToNV12(const uint8_t* src_y,
               int src_stride_y,
               const uint8_t* src_vu,
               int src_stride_vu,
               uint8_t* dst_y,
               int dst_stride_y,
               uint8_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_vu || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  if (dst_y) {
    // height is passed through with its sign, so CopyPlane sees the same
    // inversion request as this function.
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    // Recompute from the now positive height.
    halfheight = (height + 1) >> 1;
    src_vu = src_vu + (halfheight - 1) * src_stride_vu;
    src_stride_vu = -src_stride_vu;
  }

  SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
              halfheight);
  return 0;
}

// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API void SplitRGBPlane(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_r, int dst_stride_r, uint8_t* dst_g, int dst_stride_g, uint8_t* dst_b, int dst_stride_b, int width, int height) { int y; void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) = SplitRGBRow_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_r = dst_r + (height - 1) * dst_stride_r; dst_g = dst_g + (height - 1) * dst_stride_g; dst_b = dst_b + (height - 1) * dst_stride_b; dst_stride_r = -dst_stride_r; dst_stride_g = -dst_stride_g; dst_stride_b = -dst_stride_b; } // Coalesce rows. if (src_stride_rgb == width * 3 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width) { width *= height; height = 1; src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; } #if defined(HAS_SPLITRGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { SplitRGBRow = SplitRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { SplitRGBRow = SplitRGBRow_SSSE3; } } #endif #if defined(HAS_SPLITRGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitRGBRow = SplitRGBRow_Any_MMI; if (IS_ALIGNED(width, 4)) { SplitRGBRow = SplitRGBRow_MMI; } } #endif #if defined(HAS_SPLITRGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitRGBRow = SplitRGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SplitRGBRow = SplitRGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { // Copy a row of RGB. SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); dst_r += dst_stride_r; dst_g += dst_stride_g; dst_b += dst_stride_b; src_rgb += src_stride_rgb; } } LIBYUV_API void MergeRGBPlane(const uint8_t* src_r, int src_stride_r, const uint8_t* src_g, int src_stride_g, const uint8_t* src_b, int src_stride_b, uint8_t* dst_rgb, int dst_stride_rgb, int width, int height) { int y; void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width) = MergeRGBRow_C; // Coalesce rows. // Negative height means invert the image. 
if (height < 0) { height = -height; dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb; } // Coalesce rows. if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_rgb == width * 3) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; } #if defined(HAS_MERGERGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MergeRGBRow = MergeRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { MergeRGBRow = MergeRGBRow_SSSE3; } } #endif #if defined(HAS_MERGERGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeRGBRow = MergeRGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { MergeRGBRow = MergeRGBRow_NEON; } } #endif #if defined(HAS_MERGERGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MergeRGBRow = MergeRGBRow_Any_MMI; if (IS_ALIGNED(width, 8)) { MergeRGBRow = MergeRGBRow_MMI; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of RGB. MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; dst_rgb += dst_stride_rgb; } } LIBYUV_NOINLINE void SplitARGBPlaneAlpha(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_r, int dst_stride_r, uint8_t* dst_g, int dst_stride_g, uint8_t* dst_b, int dst_stride_b, uint8_t* dst_a, int dst_stride_a, int width, int height) { int y; void (*SplitARGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) = SplitARGBRow_C; assert(height > 0); if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width && dst_stride_a == width) { width *= height; height = 1; src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = dst_stride_a = 0; } #if defined(HAS_SPLITARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SplitARGBRow = SplitARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { SplitARGBRow = SplitARGBRow_SSE2; } } #endif #if defined(HAS_SPLITARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) 
{ SplitARGBRow = SplitARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { SplitARGBRow = SplitARGBRow_SSSE3; } } #endif #if defined(HAS_SPLITARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitARGBRow = SplitARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { SplitARGBRow = SplitARGBRow_AVX2; } } #endif #if defined(HAS_SPLITARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitARGBRow = SplitARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SplitARGBRow = SplitARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { SplitARGBRow(src_argb, dst_r, dst_g, dst_b, dst_a, width); dst_r += dst_stride_r; dst_g += dst_stride_g; dst_b += dst_stride_b; dst_a += dst_stride_a; src_argb += src_stride_argb; } } LIBYUV_NOINLINE void SplitARGBPlaneOpaque(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_r, int dst_stride_r, uint8_t* dst_g, int dst_stride_g, uint8_t* dst_b, int dst_stride_b, int width, int height) { int y; void (*SplitXRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) = SplitXRGBRow_C; assert(height > 0); if (src_stride_argb == width * 4 && dst_stride_r == width && dst_stride_g == width && dst_stride_b == width) { width *= height; height = 1; src_stride_argb = dst_stride_r = dst_stride_g = dst_stride_b = 0; } #if defined(HAS_SPLITXRGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SplitXRGBRow = SplitXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { SplitXRGBRow = SplitXRGBRow_SSE2; } } #endif #if defined(HAS_SPLITXRGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { SplitXRGBRow = SplitXRGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { SplitXRGBRow = SplitXRGBRow_SSSE3; } } #endif #if defined(HAS_SPLITXRGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitXRGBRow = SplitXRGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { SplitXRGBRow = SplitXRGBRow_AVX2; } } #endif #if defined(HAS_SPLITXRGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitXRGBRow = SplitXRGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SplitXRGBRow = SplitXRGBRow_NEON; } } #endif for (y = 0; y < 
height; ++y) { SplitXRGBRow(src_argb, dst_r, dst_g, dst_b, width); dst_r += dst_stride_r; dst_g += dst_stride_g; dst_b += dst_stride_b; src_argb += src_stride_argb; } } LIBYUV_API void SplitARGBPlane(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_r, int dst_stride_r, uint8_t* dst_g, int dst_stride_g, uint8_t* dst_b, int dst_stride_b, uint8_t* dst_a, int dst_stride_a, int width, int height) { // Negative height means invert the image. if (height < 0) { height = -height; dst_r = dst_r + (height - 1) * dst_stride_r; dst_g = dst_g + (height - 1) * dst_stride_g; dst_b = dst_b + (height - 1) * dst_stride_b; dst_a = dst_a + (height - 1) * dst_stride_a; dst_stride_r = -dst_stride_r; dst_stride_g = -dst_stride_g; dst_stride_b = -dst_stride_b; dst_stride_a = -dst_stride_a; } if (dst_a == NULL) { SplitARGBPlaneOpaque(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, dst_stride_g, dst_b, dst_stride_b, width, height); } else { SplitARGBPlaneAlpha(src_argb, src_stride_argb, dst_r, dst_stride_r, dst_g, dst_stride_g, dst_b, dst_stride_b, dst_a, dst_stride_a, width, height); } } LIBYUV_NOINLINE void MergeARGBPlaneAlpha(const uint8_t* src_r, int src_stride_r, const uint8_t* src_g, int src_stride_g, const uint8_t* src_b, int src_stride_b, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*MergeARGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width) = MergeARGBRow_C; assert(height > 0); if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = src_stride_a = dst_stride_argb = 0; } #if defined(HAS_MERGEARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeARGBRow = MergeARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { MergeARGBRow = MergeARGBRow_SSE2; } } #endif #if 
defined(HAS_MERGEARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeARGBRow = MergeARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeARGBRow = MergeARGBRow_AVX2; } } #endif #if defined(HAS_MERGEARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeARGBRow = MergeARGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { MergeARGBRow = MergeARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { MergeARGBRow(src_r, src_g, src_b, src_a, dst_argb, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; src_a += src_stride_a; dst_argb += dst_stride_argb; } } LIBYUV_NOINLINE void MergeARGBPlaneOpaque(const uint8_t* src_r, int src_stride_r, const uint8_t* src_g, int src_stride_g, const uint8_t* src_b, int src_stride_b, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*MergeXRGBRow)(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) = MergeXRGBRow_C; assert(height > 0); if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; } #if defined(HAS_MERGEXRGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeXRGBRow = MergeXRGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { MergeXRGBRow = MergeXRGBRow_SSE2; } } #endif #if defined(HAS_MERGEXRGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeXRGBRow = MergeXRGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeXRGBRow = MergeXRGBRow_AVX2; } } #endif #if defined(HAS_MERGEXRGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeXRGBRow = MergeXRGBRow_Any_NEON; if (IS_ALIGNED(width, 16)) { MergeXRGBRow = MergeXRGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { MergeXRGBRow(src_r, src_g, src_b, dst_argb, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; dst_argb += dst_stride_argb; } } LIBYUV_API void MergeARGBPlane(const uint8_t* src_r, int src_stride_r, const uint8_t* src_g, int src_stride_g, 
const uint8_t* src_b, int src_stride_b, const uint8_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } if (src_a == NULL) { MergeARGBPlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, src_stride_b, dst_argb, dst_stride_argb, width, height); } else { MergeARGBPlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, src_stride_b, src_a, src_stride_a, dst_argb, dst_stride_argb, width, height); } } // TODO(yuan): Support 2 bit alpha channel. LIBYUV_API void MergeXR30Plane(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, uint8_t* dst_ar30, int dst_stride_ar30, int width, int height, int depth) { int y; void (*MergeXR30Row)(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int depth, int width) = MergeXR30Row_C; // Negative height means invert the image. if (height < 0) { height = -height; dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } // Coalesce rows. 
if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_ar30 == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_ar30 = 0; } #if defined(HAS_MERGEXR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeXR30Row = MergeXR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeXR30Row = MergeXR30Row_AVX2; } } #endif #if defined(HAS_MERGEXR30ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { if (depth == 10) { MergeXR30Row = MergeXR30Row_10_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeXR30Row = MergeXR30Row_10_NEON; } } else { MergeXR30Row = MergeXR30Row_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeXR30Row = MergeXR30Row_NEON; } } } #endif for (y = 0; y < height; ++y) { MergeXR30Row(src_r, src_g, src_b, dst_ar30, depth, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; dst_ar30 += dst_stride_ar30; } } LIBYUV_NOINLINE static void MergeAR64PlaneAlpha(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, const uint16_t* src_a, int src_stride_a, uint16_t* dst_ar64, int dst_stride_ar64, int width, int height, int depth) { int y; void (*MergeAR64Row)(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_argb, int depth, int width) = MergeAR64Row_C; if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_ar64 == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = src_stride_a = dst_stride_ar64 = 0; } #if defined(HAS_MERGEAR64ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeAR64Row = MergeAR64Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeAR64Row = MergeAR64Row_AVX2; } } #endif #if defined(HAS_MERGEAR64ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeAR64Row = MergeAR64Row_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeAR64Row = MergeAR64Row_NEON; } } #endif for (y = 0; y < height; ++y) { 
MergeAR64Row(src_r, src_g, src_b, src_a, dst_ar64, depth, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; src_a += src_stride_a; dst_ar64 += dst_stride_ar64; } } LIBYUV_NOINLINE static void MergeAR64PlaneOpaque(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, uint16_t* dst_ar64, int dst_stride_ar64, int width, int height, int depth) { int y; void (*MergeXR64Row)(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_argb, int depth, int width) = MergeXR64Row_C; // Coalesce rows. if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_ar64 == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_ar64 = 0; } #if defined(HAS_MERGEXR64ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeXR64Row = MergeXR64Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeXR64Row = MergeXR64Row_AVX2; } } #endif #if defined(HAS_MERGEXR64ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeXR64Row = MergeXR64Row_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeXR64Row = MergeXR64Row_NEON; } } #endif for (y = 0; y < height; ++y) { MergeXR64Row(src_r, src_g, src_b, dst_ar64, depth, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; dst_ar64 += dst_stride_ar64; } } LIBYUV_API void MergeAR64Plane(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, const uint16_t* src_a, int src_stride_a, uint16_t* dst_ar64, int dst_stride_ar64, int width, int height, int depth) { // Negative height means invert the image. 
if (height < 0) { height = -height; dst_ar64 = dst_ar64 + (height - 1) * dst_stride_ar64; dst_stride_ar64 = -dst_stride_ar64; } if (src_a == NULL) { MergeAR64PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, src_stride_b, dst_ar64, dst_stride_ar64, width, height, depth); } else { MergeAR64PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, src_stride_b, src_a, src_stride_a, dst_ar64, dst_stride_ar64, width, height, depth); } } LIBYUV_NOINLINE static void MergeARGB16To8PlaneAlpha(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int depth) { int y; void (*MergeARGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width) = MergeARGB16To8Row_C; if (src_stride_r == width && src_stride_g == width && src_stride_b == width && src_stride_a == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = src_stride_a = dst_stride_argb = 0; } #if defined(HAS_MERGEARGB16TO8ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeARGB16To8Row = MergeARGB16To8Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeARGB16To8Row = MergeARGB16To8Row_AVX2; } } #endif #if defined(HAS_MERGEARGB16TO8ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeARGB16To8Row = MergeARGB16To8Row_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeARGB16To8Row = MergeARGB16To8Row_NEON; } } #endif for (y = 0; y < height; ++y) { MergeARGB16To8Row(src_r, src_g, src_b, src_a, dst_argb, depth, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; src_a += src_stride_a; dst_argb += dst_stride_argb; } } LIBYUV_NOINLINE static void MergeARGB16To8PlaneOpaque(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, uint8_t* 
dst_argb, int dst_stride_argb, int width, int height, int depth) { int y; void (*MergeXRGB16To8Row)(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width) = MergeXRGB16To8Row_C; // Coalesce rows. if (src_stride_r == width && src_stride_g == width && src_stride_b == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_r = src_stride_g = src_stride_b = dst_stride_argb = 0; } #if defined(HAS_MERGEXRGB16TO8ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MergeXRGB16To8Row = MergeXRGB16To8Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { MergeXRGB16To8Row = MergeXRGB16To8Row_AVX2; } } #endif #if defined(HAS_MERGEXRGB16TO8ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MergeXRGB16To8Row = MergeXRGB16To8Row_Any_NEON; if (IS_ALIGNED(width, 8)) { MergeXRGB16To8Row = MergeXRGB16To8Row_NEON; } } #endif for (y = 0; y < height; ++y) { MergeXRGB16To8Row(src_r, src_g, src_b, dst_argb, depth, width); src_r += src_stride_r; src_g += src_stride_g; src_b += src_stride_b; dst_argb += dst_stride_argb; } } LIBYUV_API void MergeARGB16To8Plane(const uint16_t* src_r, int src_stride_r, const uint16_t* src_g, int src_stride_g, const uint16_t* src_b, int src_stride_b, const uint16_t* src_a, int src_stride_a, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int depth) { // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } if (src_a == NULL) { MergeARGB16To8PlaneOpaque(src_r, src_stride_r, src_g, src_stride_g, src_b, src_stride_b, dst_argb, dst_stride_argb, width, height, depth); } else { MergeARGB16To8PlaneAlpha(src_r, src_stride_r, src_g, src_stride_g, src_b, src_stride_b, src_a, src_stride_a, dst_argb, dst_stride_argb, width, height, depth); } } // Convert YUY2 to I422. 
LIBYUV_API int YUY2ToI422(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) = YUY2ToUV422Row_C; void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. if (src_stride_yuy2 == width * 2 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width && width * height <= 32768) { width *= height; height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { YUY2ToUV422Row = YUY2ToUV422Row_SSE2; YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif #if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2; YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { YUY2ToUV422Row = YUY2ToUV422Row_AVX2; YUY2ToYRow = YUY2ToYRow_AVX2; } } #endif #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUV422Row = YUY2ToUV422Row_NEON; } } #endif #if defined(HAS_YUY2TOYROW_MMI) && defined(HAS_YUY2TOUV422ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { YUY2ToYRow = YUY2ToYRow_Any_MMI; YUY2ToUV422Row = YUY2ToUV422Row_Any_MMI; if (IS_ALIGNED(width, 8)) { YUY2ToYRow = YUY2ToYRow_MMI; YUY2ToUV422Row = YUY2ToUV422Row_MMI; } } #endif #if defined(HAS_YUY2TOYROW_MSA) && 
defined(HAS_YUY2TOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { YUY2ToYRow = YUY2ToYRow_Any_MSA; YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; if (IS_ALIGNED(width, 32)) { YUY2ToYRow = YUY2ToYRow_MSA; YUY2ToUV422Row = YUY2ToUV422Row_MSA; } } #endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); YUY2ToYRow(src_yuy2, dst_y, width); src_yuy2 += src_stride_yuy2; dst_y += dst_stride_y; dst_u += dst_stride_u; dst_v += dst_stride_v; } return 0; } // Convert UYVY to I422. LIBYUV_API int UYVYToI422(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) = UYVYToUV422Row_C; void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = UYVYToYRow_C; if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. 
if (src_stride_uyvy == width * 2 && dst_stride_y == width && dst_stride_u * 2 == width && dst_stride_v * 2 == width && width * height <= 32768) { width *= height; height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_UYVYTOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; UYVYToYRow = UYVYToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { UYVYToUV422Row = UYVYToUV422Row_SSE2; UYVYToYRow = UYVYToYRow_SSE2; } } #endif #if defined(HAS_UYVYTOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { UYVYToUV422Row = UYVYToUV422Row_Any_AVX2; UYVYToYRow = UYVYToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { UYVYToUV422Row = UYVYToUV422Row_AVX2; UYVYToYRow = UYVYToYRow_AVX2; } } #endif #if defined(HAS_UYVYTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; UYVYToUV422Row = UYVYToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUV422Row = UYVYToUV422Row_NEON; } } #endif #if defined(HAS_UYVYTOYROW_MMI) && defined(HAS_UYVYTOUV422ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { UYVYToYRow = UYVYToYRow_Any_MMI; UYVYToUV422Row = UYVYToUV422Row_Any_MMI; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_MMI; UYVYToUV422Row = UYVYToUV422Row_MMI; } } #endif #if defined(HAS_UYVYTOYROW_MSA) && defined(HAS_UYVYTOUV422ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { UYVYToYRow = UYVYToYRow_Any_MSA; UYVYToUV422Row = UYVYToUV422Row_Any_MSA; if (IS_ALIGNED(width, 32)) { UYVYToYRow = UYVYToYRow_MSA; UYVYToUV422Row = UYVYToUV422Row_MSA; } } #endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); UYVYToYRow(src_uyvy, dst_y, width); src_uyvy += src_stride_uyvy; dst_y += dst_stride_y; dst_u += dst_stride_u; dst_v += dst_stride_v; } return 0; } // Convert YUY2 to Y. 
LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { width *= height; height = 1; src_stride_yuy2 = dst_stride_y = 0; } #if defined(HAS_YUY2TOYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { YUY2ToYRow = YUY2ToYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_SSE2; } } #endif #if defined(HAS_YUY2TOYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { YUY2ToYRow = YUY2ToYRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { YUY2ToYRow = YUY2ToYRow_AVX2; } } #endif #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; } } #endif #if defined(HAS_YUY2TOYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { YUY2ToYRow = YUY2ToYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { YUY2ToYRow = YUY2ToYRow_MMI; } } #endif #if defined(HAS_YUY2TOYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { YUY2ToYRow = YUY2ToYRow_Any_MSA; if (IS_ALIGNED(width, 32)) { YUY2ToYRow = YUY2ToYRow_MSA; } } #endif for (y = 0; y < height; ++y) { YUY2ToYRow(src_yuy2, dst_y, width); src_yuy2 += src_stride_yuy2; dst_y += dst_stride_y; } return 0; } // Mirror a plane of data. // See Also I400Mirror LIBYUV_API void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; // Negative height means invert the image. 
if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_NEON; } } #endif #if defined(HAS_MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorRow = MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { MirrorRow = MirrorRow_SSSE3; } } #endif #if defined(HAS_MIRRORROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MirrorRow = MirrorRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_AVX2; } } #endif #if defined(HAS_MIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MirrorRow = MirrorRow_Any_MMI; if (IS_ALIGNED(width, 8)) { MirrorRow = MirrorRow_MMI; } } #endif #if defined(HAS_MIRRORROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MirrorRow = MirrorRow_Any_MSA; if (IS_ALIGNED(width, 64)) { MirrorRow = MirrorRow_MSA; } } #endif // Mirror plane for (y = 0; y < height; ++y) { MirrorRow(src_y, dst_y, width); src_y += src_stride_y; dst_y += dst_stride_y; } } // Mirror a plane of UV data. LIBYUV_API void MirrorUVPlane(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorUVRow_C; // Negative height means invert the image. 
if (height < 0) { height = -height; src_uv = src_uv + (height - 1) * src_stride_uv; src_stride_uv = -src_stride_uv; } #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorUVRow = MirrorUVRow_Any_NEON; if (IS_ALIGNED(width, 32)) { MirrorUVRow = MirrorUVRow_NEON; } } #endif #if defined(HAS_MIRRORUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorUVRow = MirrorUVRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_SSSE3; } } #endif #if defined(HAS_MIRRORUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MirrorUVRow = MirrorUVRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { MirrorUVRow = MirrorUVRow_AVX2; } } #endif #if defined(HAS_MIRRORUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MirrorUVRow = MirrorUVRow_Any_MSA; if (IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_MSA; } } #endif // MirrorUV plane for (y = 0; y < height; ++y) { MirrorUVRow(src_uv, dst_uv, width); src_uv += src_stride_uv; dst_uv += dst_stride_uv; } } // Mirror I400 with optional flipping LIBYUV_API int I400Mirror(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, int dst_stride_y, int width, int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); return 0; } // Mirror I420 with optional flipping LIBYUV_API int I420Mirror(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } if (dst_y) { MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; } // NV12 mirror. LIBYUV_API int NV12Mirror(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_y || !src_uv || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_uv = src_uv + (halfheight - 1) * src_stride_uv; src_stride_y = -src_stride_y; src_stride_uv = -src_stride_uv; } if (dst_y) { MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; } // ARGB mirror. LIBYUV_API int ARGBMirror(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = ARGBMirrorRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } #endif #if defined(HAS_ARGBMIRRORROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBMirrorRow = ARGBMirrorRow_SSE2; } } #endif #if defined(HAS_ARGBMIRRORROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_AVX2; } } #endif #if defined(HAS_ARGBMIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBMirrorRow = ARGBMirrorRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBMirrorRow = ARGBMirrorRow_MMI; } } #endif #if defined(HAS_ARGBMIRRORROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBMirrorRow = ARGBMirrorRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBMirrorRow = ARGBMirrorRow_MSA; } } #endif // Mirror plane for (y = 0; y < height; ++y) { ARGBMirrorRow(src_argb, dst_argb, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // RGB24 mirror. LIBYUV_API int RGB24Mirror(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { int y; void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = RGB24MirrorRow_C; if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } #if defined(HAS_RGB24MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB24MirrorRow = RGB24MirrorRow_Any_NEON; if (IS_ALIGNED(width, 16)) { RGB24MirrorRow = RGB24MirrorRow_NEON; } } #endif #if defined(HAS_RGB24MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { RGB24MirrorRow = RGB24MirrorRow_SSSE3; } } #endif // Mirror plane for (y = 0; y < height; ++y) { RGB24MirrorRow(src_rgb24, dst_rgb24, width); src_rgb24 += src_stride_rgb24; dst_rgb24 += dst_stride_rgb24; } return 0; } // Get a blender that optimized for the CPU and pixel count. // As there are 6 blenders to choose from, the caller should try to use // the same blend function for all pixels if possible. LIBYUV_API ARGBBlendRow GetARGBBlend() { void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBBlendRow = ARGBBlendRow_SSSE3; return ARGBBlendRow; } #endif #if defined(HAS_ARGBBLENDROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBBlendRow = ARGBBlendRow_NEON; } #endif #if defined(HAS_ARGBBLENDROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBBlendRow = ARGBBlendRow_MMI; } #endif #if defined(HAS_ARGBBLENDROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBBlendRow = ARGBBlendRow_MSA; } #endif return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. 
LIBYUV_API int ARGBBlend(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) = GetARGBBlend(); if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } // Coalesce rows. if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } for (y = 0; y < height; ++y) { ARGBBlendRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; } return 0; } // Alpha Blend plane and store to destination. LIBYUV_API int BlendPlane(const uint8_t* src_y0, int src_stride_y0, const uint8_t* src_y1, int src_stride_y1, const uint8_t* alpha, int alpha_stride, uint8_t* dst_y, int dst_stride_y, int width, int height) { int y; void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width) = BlendPlaneRow_C; if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows for Y plane. 
if (src_stride_y0 == width && src_stride_y1 == width && alpha_stride == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; } #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { BlendPlaneRow = BlendPlaneRow_SSSE3; } } #endif #if defined(HAS_BLENDPLANEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { BlendPlaneRow = BlendPlaneRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { BlendPlaneRow = BlendPlaneRow_AVX2; } } #endif #if defined(HAS_BLENDPLANEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { BlendPlaneRow = BlendPlaneRow_Any_MMI; if (IS_ALIGNED(width, 8)) { BlendPlaneRow = BlendPlaneRow_MMI; } } #endif for (y = 0; y < height; ++y) { BlendPlaneRow(src_y0, src_y1, alpha, dst_y, width); src_y0 += src_stride_y0; src_y1 += src_stride_y1; alpha += alpha_stride; dst_y += dst_stride_y; } return 0; } #define MAXTWIDTH 2048 // Alpha Blend YUV images and store to destination. LIBYUV_API int I420Blend(const uint8_t* src_y0, int src_stride_y0, const uint8_t* src_u0, int src_stride_u0, const uint8_t* src_v0, int src_stride_v0, const uint8_t* src_y1, int src_stride_y1, const uint8_t* src_u1, int src_stride_u1, const uint8_t* src_v1, int src_stride_v1, const uint8_t* alpha, int alpha_stride, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height) { int y; // Half width/height for UV. int halfwidth = (width + 1) >> 1; void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width) = BlendPlaneRow_C; void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Blend Y plane. BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, dst_y, dst_stride_y, width, height); #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(halfwidth, 8)) { BlendPlaneRow = BlendPlaneRow_SSSE3; } } #endif #if defined(HAS_BLENDPLANEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { BlendPlaneRow = BlendPlaneRow_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { BlendPlaneRow = BlendPlaneRow_AVX2; } } #endif #if defined(HAS_BLENDPLANEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { BlendPlaneRow = BlendPlaneRow_Any_MMI; if (IS_ALIGNED(halfwidth, 8)) { BlendPlaneRow = BlendPlaneRow_MMI; } } #endif if (!IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_C; } #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_NEON; if (IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Any_NEON; if (IS_ALIGNED(halfwidth, 16)) { ScaleRowDown2 = ScaleRowDown2Box_NEON; } } } #endif #if defined(HAS_SCALEROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_SSSE3; if (IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Any_SSSE3; if (IS_ALIGNED(halfwidth, 16)) { ScaleRowDown2 = ScaleRowDown2Box_SSSE3; } } } #endif #if defined(HAS_SCALEROWDOWN2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_AVX2; if (IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Any_AVX2; if (IS_ALIGNED(halfwidth, 32)) { ScaleRowDown2 = ScaleRowDown2Box_AVX2; } } } #endif #if defined(HAS_SCALEROWDOWN2_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleRowDown2 = ScaleRowDown2Box_Odd_MMI; if (IS_ALIGNED(width, 2)) { ScaleRowDown2 = ScaleRowDown2Box_Any_MMI; if (IS_ALIGNED(halfwidth, 8)) { ScaleRowDown2 = ScaleRowDown2Box_MMI; } } } #endif // Row buffer for intermediate alpha pixels. 
align_buffer_64(halfalpha, halfwidth); for (y = 0; y < height; y += 2) { // last row of odd height image use 1 row of alpha instead of 2. if (y == (height - 1)) { alpha_stride = 0; } // Subsample 2 rows of UV to half width and half height. ScaleRowDown2(alpha, alpha_stride, halfalpha, halfwidth); alpha += alpha_stride * 2; BlendPlaneRow(src_u0, src_u1, halfalpha, dst_u, halfwidth); BlendPlaneRow(src_v0, src_v1, halfalpha, dst_v, halfwidth); src_u0 += src_stride_u0; src_u1 += src_stride_u1; dst_u += dst_stride_u; src_v0 += src_stride_v0; src_v1 += src_stride_v1; dst_v += dst_stride_v; } free_aligned_buffer_64(halfalpha); return 0; } // Multiply 2 ARGB images and store to destination. LIBYUV_API int ARGBMultiply(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBMultiplyRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } #if defined(HAS_ARGBMULTIPLYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBMultiplyRow = ARGBMultiplyRow_SSE2; } } #endif #if defined(HAS_ARGBMULTIPLYROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBMultiplyRow = ARGBMultiplyRow_AVX2; } } #endif #if defined(HAS_ARGBMULTIPLYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBMultiplyRow = ARGBMultiplyRow_NEON; } } #endif #if defined(HAS_ARGBMULTIPLYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBMultiplyRow = ARGBMultiplyRow_MMI; } } #endif #if defined(HAS_ARGBMULTIPLYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; if (IS_ALIGNED(width, 4)) { ARGBMultiplyRow = ARGBMultiplyRow_MSA; } } #endif // Multiply plane for (y = 0; y < height; ++y) { ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; } return 0; } // Add 2 ARGB images and store to destination. LIBYUV_API int ARGBAdd(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBAddRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } #if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_SSE2; } #endif #if defined(HAS_ARGBADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBAddRow = ARGBAddRow_SSE2; } } #endif #if defined(HAS_ARGBADDROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAddRow = ARGBAddRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAddRow = ARGBAddRow_AVX2; } } #endif #if defined(HAS_ARGBADDROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAddRow = ARGBAddRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAddRow = ARGBAddRow_NEON; } } #endif #if defined(HAS_ARGBADDROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAddRow = ARGBAddRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAddRow = ARGBAddRow_MMI; } } #endif #if defined(HAS_ARGBADDROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAddRow = ARGBAddRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAddRow = ARGBAddRow_MSA; } } #endif // Add plane for (y = 0; y < height; ++y) { ARGBAddRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; } return 0; } // Subtract 2 ARGB images and store to destination. LIBYUV_API int ARGBSubtract(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBSubtractRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } #if defined(HAS_ARGBSUBTRACTROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBSubtractRow = ARGBSubtractRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBSubtractRow = ARGBSubtractRow_SSE2; } } #endif #if defined(HAS_ARGBSUBTRACTROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBSubtractRow = ARGBSubtractRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBSubtractRow = ARGBSubtractRow_AVX2; } } #endif #if defined(HAS_ARGBSUBTRACTROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBSubtractRow = ARGBSubtractRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBSubtractRow = ARGBSubtractRow_NEON; } } #endif #if defined(HAS_ARGBSUBTRACTROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBSubtractRow = ARGBSubtractRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBSubtractRow = ARGBSubtractRow_MMI; } } #endif #if defined(HAS_ARGBSUBTRACTROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBSubtractRow = ARGBSubtractRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBSubtractRow = ARGBSubtractRow_MSA; } } #endif // Subtract plane for (y = 0; y < height; ++y) { ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width); src_argb0 += src_stride_argb0; src_argb1 += src_stride_argb1; dst_argb += dst_stride_argb; } return 0; } // Convert RAW to RGB24. LIBYUV_API int RAWToRGB24(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_rgb24, int dst_stride_rgb24, int width, int height) { int y; void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = RAWToRGB24Row_C; if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } // Coalesce rows. 
if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_raw = dst_stride_rgb24 = 0; } #if defined(HAS_RAWTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToRGB24Row = RAWToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { RAWToRGB24Row = RAWToRGB24Row_SSSE3; } } #endif #if defined(HAS_RAWTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RAWToRGB24Row = RAWToRGB24Row_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToRGB24Row = RAWToRGB24Row_NEON; } } #endif #if defined(HAS_RAWTORGB24ROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { RAWToRGB24Row = RAWToRGB24Row_Any_MMI; if (IS_ALIGNED(width, 4)) { RAWToRGB24Row = RAWToRGB24Row_MMI; } } #endif #if defined(HAS_RAWTORGB24ROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { RAWToRGB24Row = RAWToRGB24Row_Any_MSA; if (IS_ALIGNED(width, 16)) { RAWToRGB24Row = RAWToRGB24Row_MSA; } } #endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); src_raw += src_stride_raw; dst_rgb24 += dst_stride_rgb24; } return 0; } LIBYUV_API void SetPlane(uint8_t* dst_y, int dst_stride_y, int width, int height, uint32_t value) { int y; void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; dst_stride_y = -dst_stride_y; } // Coalesce rows. 
if (dst_stride_y == width) { width *= height; height = 1; dst_stride_y = 0; } #if defined(HAS_SETROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SetRow = SetRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SetRow = SetRow_NEON; } } #endif #if defined(HAS_SETROW_X86) if (TestCpuFlag(kCpuHasX86)) { SetRow = SetRow_Any_X86; if (IS_ALIGNED(width, 4)) { SetRow = SetRow_X86; } } #endif #if defined(HAS_SETROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { SetRow = SetRow_ERMS; } #endif #if defined(HAS_SETROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { SetRow = SetRow_MSA; } #endif // Set plane for (y = 0; y < height; ++y) { SetRow(dst_y, value, width); dst_y += dst_stride_y; } } // Draw a rectangle into I420 LIBYUV_API int I420Rect(uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int x, int y, int width, int height, int value_y, int value_u, int value_v) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; uint8_t* start_y = dst_y + y * dst_stride_y + x; uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { return -1; } SetPlane(start_y, dst_stride_y, width, height, value_y); SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u); SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v); return 0; } // Draw a rectangle into ARGB LIBYUV_API int ARGBRect(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height, uint32_t value) { int y; void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } 
dst_argb += dst_y * dst_stride_argb + dst_x * 4; // Coalesce rows. if (dst_stride_argb == width * 4) { width *= height; height = 1; dst_stride_argb = 0; } #if defined(HAS_ARGBSETROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBSetRow = ARGBSetRow_Any_NEON; if (IS_ALIGNED(width, 4)) { ARGBSetRow = ARGBSetRow_NEON; } } #endif #if defined(HAS_ARGBSETROW_X86) if (TestCpuFlag(kCpuHasX86)) { ARGBSetRow = ARGBSetRow_X86; } #endif #if defined(HAS_ARGBSETROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBSetRow = ARGBSetRow_Any_MMI; if (IS_ALIGNED(width, 4)) { ARGBSetRow = ARGBSetRow_MMI; } } #endif #if defined(HAS_ARGBSETROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBSetRow = ARGBSetRow_Any_MSA; if (IS_ALIGNED(width, 4)) { ARGBSetRow = ARGBSetRow_MSA; } } #endif // Set plane for (y = 0; y < height; ++y) { ARGBSetRow(dst_argb, value, width); dst_argb += dst_stride_argb; } return 0; } // Convert unattentuated ARGB to preattenuated ARGB. // An unattenutated ARGB alpha blend uses the formula // p = a * f + (1 - a) * b // where // p is output pixel // f is foreground pixel // b is background pixel // a is alpha value from foreground pixel // An preattenutated ARGB alpha blend uses the formula // p = f + (1 - a) * b // where // f is foreground pixel premultiplied by alpha LIBYUV_API int ARGBAttenuate(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBATTENUATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; if (IS_ALIGNED(width, 4)) { ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; } } #endif #if defined(HAS_ARGBATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_AVX2; } } #endif #if defined(HAS_ARGBATTENUATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_NEON; } } #endif #if defined(HAS_ARGBATTENUATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBAttenuateRow = ARGBAttenuateRow_MMI; } } #endif #if defined(HAS_ARGBATTENUATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBAttenuateRow = ARGBAttenuateRow_MSA; } } #endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API int ARGBUnattenuate(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBUnattenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBUNATTENUATEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; } } #endif #if defined(HAS_ARGBUNATTENUATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; } } #endif // TODO(fbarchard): Neon version. for (y = 0; y < height; ++y) { ARGBUnattenuateRow(src_argb, dst_argb, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Convert ARGB to Grayed ARGB. LIBYUV_API int ARGBGrayTo(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBGrayRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } #endif #if defined(HAS_ARGBGRAYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_NEON; } #endif #if defined(HAS_ARGBGRAYROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBGrayRow = ARGBGrayRow_MMI; } #endif #if defined(HAS_ARGBGRAYROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_MSA; } #endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Make a rectangle of ARGB gray scale. LIBYUV_API int ARGBGray(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height) { int y; void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBGrayRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } // Coalesce rows. if (dst_stride_argb == width * 4) { width *= height; height = 1; dst_stride_argb = 0; } #if defined(HAS_ARGBGRAYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_SSSE3; } #endif #if defined(HAS_ARGBGRAYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_NEON; } #endif #if defined(HAS_ARGBGRAYROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBGrayRow = ARGBGrayRow_MMI; } #endif #if defined(HAS_ARGBGRAYROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { ARGBGrayRow = ARGBGrayRow_MSA; } #endif for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); dst += dst_stride_argb; } return 0; } // Make a rectangle of ARGB Sepia tone. 
LIBYUV_API int ARGBSepia(uint8_t* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height) { int y; void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } // Coalesce rows. if (dst_stride_argb == width * 4) { width *= height; height = 1; dst_stride_argb = 0; } #if defined(HAS_ARGBSEPIAROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_SSSE3; } #endif #if defined(HAS_ARGBSEPIAROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_NEON; } #endif #if defined(HAS_ARGBSEPIAROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBSepiaRow = ARGBSepiaRow_MMI; } #endif #if defined(HAS_ARGBSEPIAROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { ARGBSepiaRow = ARGBSepiaRow_MSA; } #endif for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); dst += dst_stride_argb; } return 0; } // Apply a 4x4 matrix to each ARGB pixel. // Note: Normally for shading, but can be used to swizzle or invert. LIBYUV_API int ARGBColorMatrix(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, const int8_t* matrix_argb, int width, int height) { int y; void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) = ARGBColorMatrixRow_C; if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOLORMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3; } #endif #if defined(HAS_ARGBCOLORMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } #endif #if defined(HAS_ARGBCOLORMATRIXROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBColorMatrixRow = ARGBColorMatrixRow_MMI; } #endif #if defined(HAS_ARGBCOLORMATRIXROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; } #endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Apply a 4x3 matrix to each ARGB pixel. // Deprecated. LIBYUV_API int RGBColorMatrix(uint8_t* dst_argb, int dst_stride_argb, const int8_t* matrix_rgb, int dst_x, int dst_y, int width, int height) { SIMD_ALIGNED(int8_t matrix_argb[16]); uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } // Convert 4x3 7 bit matrix to 4x4 6 bit matrix. 
matrix_argb[0] = matrix_rgb[0] / 2; matrix_argb[1] = matrix_rgb[1] / 2; matrix_argb[2] = matrix_rgb[2] / 2; matrix_argb[3] = matrix_rgb[3] / 2; matrix_argb[4] = matrix_rgb[4] / 2; matrix_argb[5] = matrix_rgb[5] / 2; matrix_argb[6] = matrix_rgb[6] / 2; matrix_argb[7] = matrix_rgb[7] / 2; matrix_argb[8] = matrix_rgb[8] / 2; matrix_argb[9] = matrix_rgb[9] / 2; matrix_argb[10] = matrix_rgb[10] / 2; matrix_argb[11] = matrix_rgb[11] / 2; matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; matrix_argb[15] = 64; // 1.0 return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, dst_stride_argb, &matrix_argb[0], width, height); } // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API int ARGBColorTable(uint8_t* dst_argb, int dst_stride_argb, const uint8_t* table_argb, int dst_x, int dst_y, int width, int height) { int y; void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } // Coalesce rows. if (dst_stride_argb == width * 4) { width *= height; height = 1; dst_stride_argb = 0; } #if defined(HAS_ARGBCOLORTABLEROW_X86) if (TestCpuFlag(kCpuHasX86)) { ARGBColorTableRow = ARGBColorTableRow_X86; } #endif for (y = 0; y < height; ++y) { ARGBColorTableRow(dst, table_argb, width); dst += dst_stride_argb; } return 0; } // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. 
LIBYUV_API int RGBColorTable(uint8_t* dst_argb, int dst_stride_argb, const uint8_t* table_argb, int dst_x, int dst_y, int width, int height) { int y; void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } // Coalesce rows. if (dst_stride_argb == width * 4) { width *= height; height = 1; dst_stride_argb = 0; } #if defined(HAS_RGBCOLORTABLEROW_X86) if (TestCpuFlag(kCpuHasX86)) { RGBColorTableRow = RGBColorTableRow_X86; } #endif for (y = 0; y < height; ++y) { RGBColorTableRow(dst, table_argb, width); dst += dst_stride_argb; } return 0; } // ARGBQuantize is used to posterize art. // e.g. rgb / qvalue * qvalue + qvalue / 2 // But the low levels implement efficiently with 3 parameters, and could be // used for other high level operations. // dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; // where scale is 1 / interval_size as a fixed point value. // The divide is replaces with a multiply by reciprocal fixed point multiply. // Caveat - although SSE2 saturates, the C function does not and should be used // with care if doing anything but quantization. LIBYUV_API int ARGBQuantize(uint8_t* dst_argb, int dst_stride_argb, int scale, int interval_size, int interval_offset, int dst_x, int dst_y, int width, int height) { int y; void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || interval_size < 1 || interval_size > 255) { return -1; } // Coalesce rows. 
if (dst_stride_argb == width * 4) { width *= height; height = 1; dst_stride_argb = 0; } #if defined(HAS_ARGBQUANTIZEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBQuantizeRow = ARGBQuantizeRow_SSE2; } #endif #if defined(HAS_ARGBQUANTIZEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } #endif #if defined(HAS_ARGBQUANTIZEROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_MSA; } #endif for (y = 0; y < height; ++y) { ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); dst += dst_stride_argb; } return 0; } // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API int ARGBComputeCumulativeSum(const uint8_t* src_argb, int src_stride_argb, int32_t* dst_cumsum, int dst_stride32_cumsum, int width, int height) { int y; void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) = ComputeCumulativeSumRow_C; int32_t* previous_cumsum = dst_cumsum; if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; } #endif #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; } #endif memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4); // 4 int per pixel. for (y = 0; y < height; ++y) { ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width); previous_cumsum = dst_cumsum; dst_cumsum += dst_stride32_cumsum; src_argb += src_stride_argb; } return 0; } // Blur ARGB image. // Caller should allocate CumulativeSum table of width * height * 16 bytes // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. 
LIBYUV_API int ARGBBlur(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int32_t* dst_cumsum, int dst_stride32_cumsum, int width, int height, int radius) { int y; void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) = ComputeCumulativeSumRow_C; void (*CumulativeSumToAverageRow)( const int32_t* topleft, const int32_t* botleft, int width, int area, uint8_t* dst, int count) = CumulativeSumToAverageRow_C; int32_t* cumsum_bot_row; int32_t* max_cumsum_bot_row; int32_t* cumsum_top_row; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } if (radius > height) { radius = height; } if (radius > (width / 2 - 1)) { radius = width / 2 - 1; } if (radius <= 0) { return -1; } #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2; CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2; } #endif #if defined(HAS_CUMULATIVESUMTOAVERAGEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ComputeCumulativeSumRow = ComputeCumulativeSumRow_MMI; } #endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, dst_stride32_cumsum, width, radius); src_argb = src_argb + radius * src_stride_argb; cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum]; cumsum_top_row = &dst_cumsum[0]; for (y = 0; y < height; ++y) { int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0; int bot_y = ((y + radius) < height) ? 
(y + radius) : (height - 1); int area = radius * (bot_y - top_y); int boxwidth = radius * 4; int x; int n; // Increment cumsum_top_row pointer with circular buffer wrap around. if (top_y) { cumsum_top_row += dst_stride32_cumsum; if (cumsum_top_row >= max_cumsum_bot_row) { cumsum_top_row = dst_cumsum; } } // Increment cumsum_bot_row pointer with circular buffer wrap around and // then fill in a row of CumulativeSum. if ((y + radius) < height) { const int32_t* prev_cumsum_bot_row = cumsum_bot_row; cumsum_bot_row += dst_stride32_cumsum; if (cumsum_bot_row >= max_cumsum_bot_row) { cumsum_bot_row = dst_cumsum; } ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row, width); src_argb += src_stride_argb; } // Left clipped. for (x = 0; x < radius + 1; ++x) { CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. n = (width - 1) - radius - x + 1; CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, cumsum_bot_row + (x - radius - 1) * 4, boxwidth, area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } return 0; } // Multiply ARGB image by a specified ARGB value. LIBYUV_API int ARGBShade(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height, uint32_t value) { int y; void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) = ARGBShadeRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { return -1; } if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBSHADEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { ARGBShadeRow = ARGBShadeRow_SSE2; } #endif #if defined(HAS_ARGBSHADEROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBShadeRow = ARGBShadeRow_NEON; } #endif #if defined(HAS_ARGBSHADEROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 2)) { ARGBShadeRow = ARGBShadeRow_MMI; } #endif #if defined(HAS_ARGBSHADEROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { ARGBShadeRow = ARGBShadeRow_MSA; } #endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API int InterpolatePlane(const uint8_t* src0, int src_stride0, const uint8_t* src1, int src_stride1, uint8_t* dst, int dst_stride, int width, int height, int interpolation) { int y; void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; dst = dst + (height - 1) * dst_stride; dst_stride = -dst_stride; } // Coalesce rows. 
if (src_stride0 == width && src_stride1 == width && dst_stride == width) { width *= height; height = 1; src_stride0 = src_stride1 = dst_stride = 0; } #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(width, 8)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif for (y = 0; y < height; ++y) { InterpolateRow(dst, src0, src1 - src0, width, interpolation); src0 += src_stride0; src1 += src_stride1; dst += dst_stride; } return 0; } // Interpolate 2 ARGB images by specified amount (0 to 255). LIBYUV_API int ARGBInterpolate(const uint8_t* src_argb0, int src_stride_argb0, const uint8_t* src_argb1, int src_stride_argb1, uint8_t* dst_argb, int dst_stride_argb, int width, int height, int interpolation) { return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, src_stride_argb1, dst_argb, dst_stride_argb, width * 4, height, interpolation); } // Interpolate 2 YUV images by specified amount (0 to 255). 
LIBYUV_API int I420Interpolate(const uint8_t* src0_y, int src0_stride_y, const uint8_t* src0_u, int src0_stride_u, const uint8_t* src0_v, int src0_stride_v, const uint8_t* src1_y, int src1_stride_y, const uint8_t* src1_u, int src1_stride_u, const uint8_t* src1_v, int src1_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, int interpolation) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, dst_stride_y, width, height, interpolation); InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, dst_stride_u, halfwidth, halfheight, interpolation); InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, dst_stride_v, halfwidth, halfheight, interpolation); return 0; } // Shuffle ARGB channel order. e.g. BGRA to ARGB. LIBYUV_API int ARGBShuffle(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_argb, int dst_stride_argb, const uint8_t* shuffler, int width, int height) { int y; void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, const uint8_t* shuffler, int width) = ARGBShuffleRow_C; if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_bgra = src_bgra + (height - 1) * src_stride_bgra; src_stride_bgra = -src_stride_bgra; } // Coalesce rows. 
if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_bgra = dst_stride_argb = 0; } #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { ARGBShuffleRow = ARGBShuffleRow_SSSE3; } } #endif #if defined(HAS_ARGBSHUFFLEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBShuffleRow = ARGBShuffleRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGBShuffleRow = ARGBShuffleRow_AVX2; } } #endif #if defined(HAS_ARGBSHUFFLEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBShuffleRow = ARGBShuffleRow_Any_NEON; if (IS_ALIGNED(width, 4)) { ARGBShuffleRow = ARGBShuffleRow_NEON; } } #endif #if defined(HAS_ARGBSHUFFLEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBShuffleRow = ARGBShuffleRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBShuffleRow = ARGBShuffleRow_MMI; } } #endif #if defined(HAS_ARGBSHUFFLEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBShuffleRow = ARGBShuffleRow_Any_MSA; if (IS_ALIGNED(width, 8)) { ARGBShuffleRow = ARGBShuffleRow_MSA; } } #endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); src_bgra += src_stride_bgra; dst_argb += dst_stride_argb; } return 0; } // Shuffle AR64 channel order. e.g. AR64 to AB64. LIBYUV_API int AR64Shuffle(const uint16_t* src_ar64, int src_stride_ar64, uint16_t* dst_ar64, int dst_stride_ar64, const uint8_t* shuffler, int width, int height) { int y; void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, const uint8_t* shuffler, int width) = AR64ShuffleRow_C; if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; src_stride_ar64 = -src_stride_ar64; } // Coalesce rows. 
if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { width *= height; height = 1; src_stride_ar64 = dst_stride_ar64 = 0; } // Assembly versions can be reused if it's implemented with shuffle. #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { AR64ShuffleRow = ARGBShuffleRow_SSSE3; } } #endif #if defined(HAS_ARGBSHUFFLEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { AR64ShuffleRow = ARGBShuffleRow_AVX2; } } #endif #if defined(HAS_ARGBSHUFFLEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { AR64ShuffleRow = ARGBShuffleRow_Any_NEON; if (IS_ALIGNED(width, 4)) { AR64ShuffleRow = ARGBShuffleRow_NEON; } } #endif #if defined(HAS_ARGBSHUFFLEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { AR64ShuffleRow = ARGBShuffleRow_Any_MMI; if (IS_ALIGNED(width, 2)) { AR64ShuffleRow = ARGBShuffleRow_MMI; } } #endif for (y = 0; y < height; ++y) { AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, width * 2); src_ar64 += src_stride_ar64; dst_ar64 += dst_stride_ar64; } return 0; } // Gauss blur a float plane using Gaussian 5x5 filter with // coefficients of 1, 4, 6, 4, 1. // Each destination pixel is a blur of the 5x5 // pixels from the source. // Source edges are clamped. // Edge is 2 pixels on each side, and interior is multiple of 4. LIBYUV_API int GaussPlane_F32(const float* src, int src_stride, float* dst, int dst_stride, int width, int height) { int y; void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, const float* src3, const float* src4, float* dst, int width) = GaussCol_F32_C; void (*GaussRow_F32)(const float* src, float* dst, int width) = GaussRow_F32_C; if (!src || !dst || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src = src + (height - 1) * src_stride; src_stride = -src_stride; } #if defined(HAS_GAUSSCOL_F32_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { GaussCol_F32 = GaussCol_F32_NEON; } #endif #if defined(HAS_GAUSSROW_F32_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { GaussRow_F32 = GaussRow_F32_NEON; } #endif { // 2 pixels on each side, but aligned out to 16 bytes. align_buffer_64(rowbuf, (4 + width + 4) * 4); memset(rowbuf, 0, 16); memset(rowbuf + (4 + width) * 4, 0, 16); float* row = (float*)(rowbuf + 16); const float* src0 = src; const float* src1 = src; const float* src2 = src; const float* src3 = src2 + ((height > 1) ? src_stride : 0); const float* src4 = src3 + ((height > 2) ? src_stride : 0); for (y = 0; y < height; ++y) { GaussCol_F32(src0, src1, src2, src3, src4, row, width); // Extrude edge by 2 floats row[-2] = row[-1] = row[0]; row[width + 1] = row[width] = row[width - 1]; GaussRow_F32(row - 2, dst, width); src0 = src1; src1 = src2; src2 = src3; src3 = src4; if ((y + 2) < (height - 1)) { src4 += src_stride; } dst += dst_stride; } free_aligned_buffer_64(rowbuf); } return 0; } // Sobel ARGB effect. static int ARGBSobelize(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height, void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst, int width)) { int y; void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = ARGBToYJRow_C; void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) = SobelYRow_C; void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, uint8_t* dst_sobely, int width) = SobelXRow_C; const int kEdge = 16; // Extra pixels at start of row for extrude/align. if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } #if defined(HAS_ARGBTOYJROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYJRow = ARGBToYJRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif #if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_NEON; } } #endif #if defined(HAS_ARGBTOYJROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBToYJRow = ARGBToYJRow_Any_MMI; if (IS_ALIGNED(width, 8)) { ARGBToYJRow = ARGBToYJRow_MMI; } } #endif #if defined(HAS_ARGBTOYJROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBToYJRow = ARGBToYJRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBToYJRow = ARGBToYJRow_MSA; } } #endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelYRow = SobelYRow_SSE2; } #endif #if defined(HAS_SOBELYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SobelYRow = SobelYRow_NEON; } #endif #if defined(HAS_SOBELYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelYRow = SobelYRow_MMI; } #endif #if defined(HAS_SOBELYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SobelYRow = SobelYRow_MSA; } #endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; } #endif #if defined(HAS_SOBELXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SobelXRow = SobelXRow_NEON; } #endif #if defined(HAS_SOBELXROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelXRow = SobelXRow_MMI; } #endif #if defined(HAS_SOBELXROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SobelXRow = SobelXRow_MSA; } #endif { // 3 rows with edges before/after. 
const int kRowSize = (width + kEdge + 31) & ~31; align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); uint8_t* row_sobelx = rows; uint8_t* row_sobely = rows + kRowSize; uint8_t* row_y = rows + kRowSize * 2; // Convert first row. uint8_t* row_y0 = row_y + kEdge; uint8_t* row_y1 = row_y0 + kRowSize; uint8_t* row_y2 = row_y1 + kRowSize; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. ARGBToYJRow(src_argb, row_y1, width); row_y1[-1] = row_y1[0]; memset(row_y1 + width, row_y1[width - 1], 16); memset(row_y2 + width, 0, 16); for (y = 0; y < height; ++y) { // Convert next row of ARGB to G. if (y < (height - 1)) { src_argb += src_stride_argb; } ARGBToYJRow(src_argb, row_y2, width); row_y2[-1] = row_y2[0]; row_y2[width] = row_y2[width - 1]; SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width); SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width); SobelRow(row_sobelx, row_sobely, dst_argb, width); // Cycle thru circular queue of 3 row_y buffers. { uint8_t* row_yt = row_y0; row_y0 = row_y1; row_y1 = row_y2; row_y2 = row_yt; } dst_argb += dst_stride_argb; } free_aligned_buffer_64(rows); } return 0; } // Sobel ARGB effect. 
LIBYUV_API int ARGBSobel(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelRow = SobelRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { SobelRow = SobelRow_SSE2; } } #endif #if defined(HAS_SOBELROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SobelRow = SobelRow_Any_NEON; if (IS_ALIGNED(width, 8)) { SobelRow = SobelRow_NEON; } } #endif #if defined(HAS_SOBELROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelRow = SobelRow_Any_MMI; if (IS_ALIGNED(width, 8)) { SobelRow = SobelRow_MMI; } } #endif #if defined(HAS_SOBELROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SobelRow = SobelRow_Any_MSA; if (IS_ALIGNED(width, 16)) { SobelRow = SobelRow_MSA; } } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); } // Sobel ARGB effect with planar output. 
LIBYUV_API int ARGBSobelToPlane(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_y, int dst_stride_y, int width, int height) { void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { SobelToPlaneRow = SobelToPlaneRow_SSE2; } } #endif #if defined(HAS_SOBELTOPLANEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SobelToPlaneRow = SobelToPlaneRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SobelToPlaneRow = SobelToPlaneRow_NEON; } } #endif #if defined(HAS_SOBELTOPLANEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelToPlaneRow = SobelToPlaneRow_Any_MMI; if (IS_ALIGNED(width, 8)) { SobelToPlaneRow = SobelToPlaneRow_MMI; } } #endif #if defined(HAS_SOBELTOPLANEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SobelToPlaneRow = SobelToPlaneRow_Any_MSA; if (IS_ALIGNED(width, 32)) { SobelToPlaneRow = SobelToPlaneRow_MSA; } } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, height, SobelToPlaneRow); } // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. 
LIBYUV_API int ARGBSobelXY(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXYRow = SobelXYRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { SobelXYRow = SobelXYRow_SSE2; } } #endif #if defined(HAS_SOBELXYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SobelXYRow = SobelXYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { SobelXYRow = SobelXYRow_NEON; } } #endif #if defined(HAS_SOBELXYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SobelXYRow = SobelXYRow_Any_MMI; if (IS_ALIGNED(width, 8)) { SobelXYRow = SobelXYRow_MMI; } } #endif #if defined(HAS_SOBELXYROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SobelXYRow = SobelXYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { SobelXYRow = SobelXYRow_MSA; } } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); } // Apply a 4x4 polynomial to each ARGB pixel. LIBYUV_API int ARGBPolynomial(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, const float* poly, int width, int height) { int y; void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) = ARGBPolynomialRow_C; if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBPOLYNOMIALROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) { ARGBPolynomialRow = ARGBPolynomialRow_SSE2; } #endif #if defined(HAS_ARGBPOLYNOMIALROW_AVX2) if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) && IS_ALIGNED(width, 2)) { ARGBPolynomialRow = ARGBPolynomialRow_AVX2; } #endif for (y = 0; y < height; ++y) { ARGBPolynomialRow(src_argb, dst_argb, poly, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Convert plane of 16 bit shorts to half floats. // Source values are multiplied by scale before storing as half float. LIBYUV_API int HalfFloatPlane(const uint16_t* src_y, int src_stride_y, uint16_t* dst_y, int dst_stride_y, float scale, int width, int height) { int y; void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, int width) = HalfFloatRow_C; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } src_stride_y >>= 1; dst_stride_y >>= 1; // Negative height means invert the image. if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } // Coalesce rows. if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; } #if defined(HAS_HALFFLOATROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { HalfFloatRow = HalfFloatRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { HalfFloatRow = HalfFloatRow_SSE2; } } #endif #if defined(HAS_HALFFLOATROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { HalfFloatRow = HalfFloatRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { HalfFloatRow = HalfFloatRow_AVX2; } } #endif #if defined(HAS_HALFFLOATROW_F16C) if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; if (IS_ALIGNED(width, 16)) { HalfFloatRow = (scale == 1.0f) ? 
HalfFloat1Row_F16C : HalfFloatRow_F16C; } } #endif #if defined(HAS_HALFFLOATROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; if (IS_ALIGNED(width, 8)) { HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; } } #endif #if defined(HAS_HALFFLOATROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { HalfFloatRow = HalfFloatRow_Any_MSA; if (IS_ALIGNED(width, 32)) { HalfFloatRow = HalfFloatRow_MSA; } } #endif for (y = 0; y < height; ++y) { HalfFloatRow(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } return 0; } // Convert a buffer of bytes to floats, scale the values and store as floats. LIBYUV_API int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, int width) = ByteToFloatRow_C; if (!src_y || !dst_y || width <= 0) { return -1; } #if defined(HAS_BYTETOFLOATROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ByteToFloatRow = ByteToFloatRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ByteToFloatRow = ByteToFloatRow_NEON; } } #endif ByteToFloatRow(src_y, dst_y, scale, width); return 0; } // Apply a lumacolortable to each ARGB pixel. LIBYUV_API int ARGBLumaColorTable(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, const uint8_t* luma, int width, int height) { int y; void (*ARGBLumaColorTableRow)( const uint8_t* src_argb, uint8_t* dst_argb, int width, const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; } #endif for (y = 0; y < height; ++y) { ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Copy Alpha from one ARGB image to another. LIBYUV_API int ARGBCopyAlpha(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int y; void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBCopyAlphaRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; } #if defined(HAS_ARGBCOPYALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2; } } #endif #if defined(HAS_ARGBCOPYALPHAROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2; } } #endif #if defined(HAS_ARGBCOPYALPHAROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBCopyAlphaRow = ARGBCopyAlphaRow_MMI; } } #endif for (y = 0; y < height; ++y) { ARGBCopyAlphaRow(src_argb, dst_argb, width); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } return 0; } // Extract just the alpha channel from ARGB. 
// Writes one byte per source pixel into the dst_a plane.
// A negative height reads the source bottom-up.
// Returns 0 on success, or -1 when a pointer is NULL, width is not positive,
// or height is 0.
LIBYUV_API
int ARGBExtractAlpha(const uint8_t* src_argb,
                     int src_stride_argb,
                     uint8_t* dst_a,
                     int dst_stride_a,
                     int width,
                     int height) {
  int row;
  void (*ExtractRow)(const uint8_t* src_argb, uint8_t* dst_a, int width) =
      ARGBExtractAlphaRow_C;
  if (!src_argb || !dst_a) {
    return -1;
  }
  if (width <= 0 || height == 0) {
    return -1;
  }
  // A negative height flips the image: begin at the last source row and step
  // backwards.
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  // Contiguous source and destination are treated as a single long row.
  if (src_stride_argb == width * 4 && dst_stride_a == width) {
    width *= height;
    height = 1;
    src_stride_argb = 0;
    dst_stride_a = 0;
  }
  // Kernel selection: later sections override earlier ones. The "Any" kernel
  // is the default within each instruction set; the exact kernel replaces it
  // when width is a multiple of its block size.
#if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ExtractRow = ARGBExtractAlphaRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
      ExtractRow = ARGBExtractAlphaRow_SSE2;
    }
  }
#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ExtractRow = ARGBExtractAlphaRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ExtractRow = ARGBExtractAlphaRow_AVX2;
    }
  }
#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ExtractRow = ARGBExtractAlphaRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      ExtractRow = ARGBExtractAlphaRow_NEON;
    }
  }
#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ExtractRow = ARGBExtractAlphaRow_Any_MMI;
    if (IS_ALIGNED(width, 8)) {
      ExtractRow = ARGBExtractAlphaRow_MMI;
    }
  }
#endif
#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ExtractRow = ARGBExtractAlphaRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      ExtractRow = ARGBExtractAlphaRow_MSA;
    }
  }
#endif

  for (row = 0; row < height; ++row) {
    ExtractRow(src_argb, dst_a, width);
    src_argb += src_stride_argb;
    dst_a += dst_stride_a;
  }
  return 0;
}

// Copy a planar Y channel to the alpha channel of a destination ARGB image.
// A negative height reads the source bottom-up.
// Returns 0 on success, or -1 when a pointer is NULL, width is not positive,
// or height is 0.
LIBYUV_API
int ARGBCopyYToAlpha(const uint8_t* src_y,
                     int src_stride_y,
                     uint8_t* dst_argb,
                     int dst_stride_argb,
                     int width,
                     int height) {
  int row;
  void (*CopyRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
      ARGBCopyYToAlphaRow_C;
  if (!src_y || !dst_argb) {
    return -1;
  }
  if (width <= 0 || height == 0) {
    return -1;
  }
  // A negative height flips the image: begin at the last source row and step
  // backwards.
  if (height < 0) {
    height = -height;
    src_y += (height - 1) * src_stride_y;
    src_stride_y = -src_stride_y;
  }
  // Contiguous source and destination are treated as a single long row.
  if (src_stride_y == width && dst_stride_argb == width * 4) {
    width *= height;
    height = 1;
    src_stride_y = 0;
    dst_stride_argb = 0;
  }
  // Kernel selection: later sections override earlier ones. Within each
  // instruction set the exact kernel is used when width is a multiple of its
  // block size, otherwise the "Any" kernel.
#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    CopyRow = IS_ALIGNED(width, 8) ? ARGBCopyYToAlphaRow_SSE2
                                   : ARGBCopyYToAlphaRow_Any_SSE2;
  }
#endif
#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    CopyRow = IS_ALIGNED(width, 16) ? ARGBCopyYToAlphaRow_AVX2
                                    : ARGBCopyYToAlphaRow_Any_AVX2;
  }
#endif
#if defined(HAS_ARGBCOPYYTOALPHAROW_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    CopyRow = IS_ALIGNED(width, 8) ? ARGBCopyYToAlphaRow_MMI
                                   : ARGBCopyYToAlphaRow_Any_MMI;
  }
#endif

  for (row = 0; row < height; ++row) {
    CopyRow(src_y, dst_argb, width);
    src_y += src_stride_y;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

// TODO(fbarchard): Consider if width is even Y channel can be split
// directly. A SplitUVRow_Odd function could copy the remaining chroma.
LIBYUV_API int YUY2ToNV12(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } #if defined(HAS_SPLITUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { SplitUVRow = SplitUVRow_SSE2; } } #endif #if defined(HAS_SPLITUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitUVRow = SplitUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { SplitUVRow = SplitUVRow_AVX2; } } #endif #if defined(HAS_SPLITUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitUVRow = SplitUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SplitUVRow = SplitUVRow_NEON; } } #endif #if defined(HAS_SPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitUVRow = SplitUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { SplitUVRow = SplitUVRow_MMI; } } #endif #if defined(HAS_SPLITUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SplitUVRow = SplitUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { SplitUVRow = SplitUVRow_MSA; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { 
InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(width, 8)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif { int awidth = halfwidth * 2; // row of y and 2 rows of uv align_buffer_64(rows, awidth * 3); for (y = 0; y < height - 1; y += 2) { // Split Y from UV. SplitUVRow(src_yuy2, rows, rows + awidth, awidth); memcpy(dst_y, rows, width); SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth); memcpy(dst_y + dst_stride_y, rows, width); InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); src_yuy2 += src_stride_yuy2 * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; } if (height & 1) { // Split Y from UV. SplitUVRow(src_yuy2, rows, dst_uv, awidth); memcpy(dst_y, rows, width); } free_aligned_buffer_64(rows); } return 0; } LIBYUV_API int UYVYToNV12(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; int halfwidth = (width + 1) >> 1; void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } #if defined(HAS_SPLITUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SplitUVRow = SplitUVRow_Any_SSE2; if (IS_ALIGNED(width, 16)) { SplitUVRow = SplitUVRow_SSE2; } } #endif #if defined(HAS_SPLITUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { SplitUVRow = SplitUVRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { SplitUVRow = SplitUVRow_AVX2; } } #endif #if defined(HAS_SPLITUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitUVRow = SplitUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { SplitUVRow = SplitUVRow_NEON; } } #endif #if defined(HAS_SPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { SplitUVRow = SplitUVRow_Any_MMI; if (IS_ALIGNED(width, 8)) { SplitUVRow = SplitUVRow_MMI; } } #endif #if defined(HAS_SPLITUVROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { SplitUVRow = SplitUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { SplitUVRow = SplitUVRow_MSA; } } #endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(width, 8)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif { int awidth = halfwidth * 2; // row of y and 2 rows of uv align_buffer_64(rows, awidth * 3); 
for (y = 0; y < height - 1; y += 2) { // Split Y from UV. SplitUVRow(src_uyvy, rows + awidth, rows, awidth); memcpy(dst_y, rows, width); SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth * 2, rows, awidth); memcpy(dst_y + dst_stride_y, rows, width); InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128); src_uyvy += src_stride_uyvy * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; } if (height & 1) { // Split Y from UV. SplitUVRow(src_uyvy, dst_uv, rows, awidth); memcpy(dst_y, rows, width); } free_aligned_buffer_64(rows); } return 0; } // width and height are src size allowing odd size handling. LIBYUV_API void HalfMergeUVPlane(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int dst_stride_uv, int width, int height) { int y; void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width) = HalfMergeUVRow_C; // Negative height means invert the image. if (height < 0) { height = -height; src_u = src_u + (height - 1) * src_stride_u; src_v = src_v + (height - 1) * src_stride_v; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } #if defined(HAS_HALFMERGEUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { HalfMergeUVRow = HalfMergeUVRow_NEON; } #endif #if defined(HAS_HALFMERGEUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { HalfMergeUVRow = HalfMergeUVRow_SSSE3; } #endif #if defined(HAS_HALFMERGEUVROW_AVX2) if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { HalfMergeUVRow = HalfMergeUVRow_AVX2; } #endif for (y = 0; y < height - 1; y += 2) { // Merge a row of U and V into a row of UV. 
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width); src_u += src_stride_u * 2; src_v += src_stride_v * 2; dst_uv += dst_stride_uv; } if (height & 1) { HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width); } } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate.cc000066400000000000000000000556511416500237200206470ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate.h" #include "libyuv/convert.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif LIBYUV_API void TransposePlane(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height) { int i = height; #if defined(HAS_TRANSPOSEWX16_MSA) void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx16_C; #else void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) = TransposeWx8_C; #endif #if defined(HAS_TRANSPOSEWX16_MSA) if (TestCpuFlag(kCpuHasMSA)) { TransposeWx16 = TransposeWx16_Any_MSA; if (IS_ALIGNED(width, 16)) { TransposeWx16 = TransposeWx16_MSA; } } #else #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; } #endif #if defined(HAS_TRANSPOSEWX8_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { TransposeWx8 = TransposeWx8_Any_SSSE3; if (IS_ALIGNED(width, 8)) { TransposeWx8 = TransposeWx8_SSSE3; } } #endif #if defined(HAS_TRANSPOSEWX8_MMI) if (TestCpuFlag(kCpuHasMMI)) { 
TransposeWx8 = TransposeWx8_MMI; } #endif #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; if (IS_ALIGNED(width, 16)) { TransposeWx8 = TransposeWx8_Fast_SSSE3; } } #endif #endif /* defined(HAS_TRANSPOSEWX16_MSA) */ #if defined(HAS_TRANSPOSEWX16_MSA) // Work across the source in 16x16 tiles while (i >= 16) { TransposeWx16(src, src_stride, dst, dst_stride, width); src += 16 * src_stride; // Go down 16 rows. dst += 16; // Move over 16 columns. i -= 16; } #else // Work across the source in 8x8 tiles while (i >= 8) { TransposeWx8(src, src_stride, dst, dst_stride, width); src += 8 * src_stride; // Go down 8 rows. dst += 8; // Move over 8 columns. i -= 8; } #endif if (i > 0) { TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); } } LIBYUV_API void RotatePlane90(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height) { // Rotate by 90 is a transpose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. src += src_stride * (height - 1); src_stride = -src_stride; TransposePlane(src, src_stride, dst, dst_stride, width, height); } LIBYUV_API void RotatePlane270(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height) { // Rotate by 270 is a transpose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst += dst_stride * (width - 1); dst_stride = -dst_stride; TransposePlane(src, src_stride, dst, dst_stride, width, height); } LIBYUV_API void RotatePlane180(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height) { // Swap first and last row and mirror the content. Uses a temporary row. 
align_buffer_64(row, width); const uint8_t* src_bot = src + src_stride * (height - 1); uint8_t* dst_bot = dst + dst_stride * (height - 1); int half_height = (height + 1) >> 1; int y; void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_NEON; } } #endif #if defined(HAS_MIRRORROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { MirrorRow = MirrorRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { MirrorRow = MirrorRow_SSSE3; } } #endif #if defined(HAS_MIRRORROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { MirrorRow = MirrorRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_AVX2; } } #endif #if defined(HAS_MIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { MirrorRow = MirrorRow_Any_MMI; if (IS_ALIGNED(width, 8)) { MirrorRow = MirrorRow_MMI; } } #endif #if defined(HAS_MIRRORROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { MirrorRow = MirrorRow_Any_MSA; if (IS_ALIGNED(width, 64)) { MirrorRow = MirrorRow_MSA; } } #endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) if (TestCpuFlag(kCpuHasAVX)) { CopyRow = IS_ALIGNED(width, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; } #endif #if defined(HAS_COPYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif #if defined(HAS_COPYROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { CopyRow = IS_ALIGNED(width, 8) ? CopyRow_MMI : CopyRow_Any_MMI; } #endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { CopyRow(src, row, width); // Copy first row into buffer MirrorRow(src_bot, dst, width); // Mirror last row into first row MirrorRow(row, dst_bot, width); // Mirror buffer into last row src += src_stride; dst += dst_stride; src_bot -= src_stride; dst_bot -= dst_stride; } free_aligned_buffer_64(row); } LIBYUV_API void SplitTransposeUV(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height) { int i = height; #if defined(HAS_TRANSPOSEUVWX16_MSA) void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx16_C; #else void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; #endif #if defined(HAS_TRANSPOSEUVWX16_MSA) if (TestCpuFlag(kCpuHasMSA)) { TransposeUVWx16 = TransposeUVWx16_Any_MSA; if (IS_ALIGNED(width, 8)) { TransposeUVWx16 = TransposeUVWx16_MSA; } } #else #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; } #endif #if defined(HAS_TRANSPOSEUVWX8_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { TransposeUVWx8 = TransposeUVWx8_Any_SSE2; if (IS_ALIGNED(width, 8)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } } #endif #if defined(HAS_TRANSPOSEUVWX8_MMI) if (TestCpuFlag(kCpuHasMMI)) { TransposeUVWx8 = TransposeUVWx8_Any_MMI; if (IS_ALIGNED(width, 4)) { TransposeUVWx8 = TransposeUVWx8_MMI; } } #endif #endif /* defined(HAS_TRANSPOSEUVWX16_MSA) */ #if defined(HAS_TRANSPOSEUVWX16_MSA) // Work through the source in 8x8 tiles. while (i >= 16) { TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); src += 16 * src_stride; // Go down 16 rows. dst_a += 16; // Move over 8 columns. dst_b += 16; // Move over 8 columns. i -= 16; } #else // Work through the source in 8x8 tiles. 
while (i >= 8) { TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); src += 8 * src_stride; // Go down 8 rows. dst_a += 8; // Move over 8 columns. dst_b += 8; // Move over 8 columns. i -= 8; } #endif if (i > 0) { TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, i); } } LIBYUV_API void SplitRotateUV90(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height) { src += src_stride * (height - 1); src_stride = -src_stride; SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, height); } LIBYUV_API void SplitRotateUV270(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height) { dst_a += dst_stride_a * (width - 1); dst_b += dst_stride_b * (width - 1); dst_stride_a = -dst_stride_a; dst_stride_b = -dst_stride_b; SplitTransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, height); } // Rotate 180 is a horizontal and vertical flip. 
LIBYUV_API void SplitRotateUV180(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height) { int i; void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, int width) = MirrorSplitUVRow_C; #if defined(HAS_MIRRORSPLITUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { MirrorSplitUVRow = MirrorSplitUVRow_NEON; } #endif #if defined(HAS_MIRRORSPLITUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; } #endif #if defined(HAS_MIRRORSPLITUVROW_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(width, 8)) { MirrorSplitUVRow = MirrorSplitUVRow_MMI; } #endif #if defined(HAS_MIRRORSPLITUVROW_MSA) if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { MirrorSplitUVRow = MirrorSplitUVRow_MSA; } #endif dst_a += dst_stride_a * (height - 1); dst_b += dst_stride_b * (height - 1); for (i = 0; i < height; ++i) { MirrorSplitUVRow(src, dst_a, dst_b, width); src += src_stride; dst_a -= dst_stride_a; dst_b -= dst_stride_b; } } // Rotate UV and split into planar. // width and height expected to be half size for NV12 LIBYUV_API int SplitRotateUV(const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode) { if (!src_uv || width <= 0 || height == 0 || !dst_u || !dst_v) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_uv = src_uv + (height - 1) * src_stride_uv; src_stride_uv = -src_stride_uv; } switch (mode) { case kRotate0: SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); return 0; case kRotate90: SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); return 0; case kRotate270: SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); return 0; case kRotate180: SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); return 0; default: break; } return -1; } LIBYUV_API int RotatePlane(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height, enum RotationMode mode) { if (!src || width <= 0 || height == 0 || !dst) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src = src + (height - 1) * src_stride; src_stride = -src_stride; } switch (mode) { case kRotate0: // copy frame CopyPlane(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate90: RotatePlane90(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate270: RotatePlane270(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate180: RotatePlane180(src, src_stride, dst, dst_stride, width, height); return 0; default: break; } return -1; } LIBYUV_API int I420Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || !dst_u || !dst_v) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } switch (mode) { case kRotate0: // copy frame return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); case kRotate90: RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; default: break; } return -1; } LIBYUV_API int I444Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum libyuv::RotationMode mode) { if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || !dst_u || !dst_v) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (height - 1) * src_stride_u; src_v = src_v + (height - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } switch (mode) { case libyuv::kRotate0: // copy frame CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; case libyuv::kRotate90: RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; case libyuv::kRotate270: RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; case libyuv::kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); return 0; default: break; } return -1; } LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || !dst_v) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_uv = src_uv + (halfheight - 1) * src_stride_uv; src_stride_y = -src_stride_y; src_stride_uv = -src_stride_uv; } switch (mode) { case kRotate0: // copy frame return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); case kRotate90: RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); SplitRotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); SplitRotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); SplitRotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; default: break; } return -1; } static void SplitPixels(const uint8_t* src_u, int src_pixel_stride_uv, uint8_t* dst_u, int width) { int i; for (i = 0; i < width; ++i) { *dst_u = *src_u; ++dst_u; src_u += src_pixel_stride_uv; } } // Convert Android420 to I420 with Rotate LIBYUV_API int Android420ToI420Rotate(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_pixel_stride_uv, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int width, int height, enum RotationMode rotation) { int y; const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; halfheight = (height + 1) >> 1; src_y = src_y + (height - 1) * src_stride_y; src_u = src_u + (halfheight - 1) * src_stride_u; src_v = src_v + (halfheight - 1) * src_stride_v; src_stride_y = -src_stride_y; src_stride_u = -src_stride_u; src_stride_v = -src_stride_v; } if (dst_y) { RotatePlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height, rotation); } // Copy UV planes - I420 if (src_pixel_stride_uv == 1) { RotatePlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight, rotation); RotatePlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight, rotation); return 0; } // Split UV planes - NV21 if (src_pixel_stride_uv == 2 && vu_off == -1 && src_stride_u == src_stride_v) { SplitRotateUV(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, halfwidth, halfheight, rotation); return 0; } // Split UV planes - NV12 if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { SplitRotateUV(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight, rotation); return 0; } if (rotation == 0) { for (y = 0; y < halfheight; ++y) { SplitPixels(src_u, src_pixel_stride_uv, dst_u, halfwidth); SplitPixels(src_v, src_pixel_stride_uv, dst_v, halfwidth); src_u += src_stride_u; src_v += src_stride_v; dst_u += dst_stride_u; dst_v += dst_stride_v; } return 0; } // unsupported type and/or rotation. return -1; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate_any.cc000066400000000000000000000060321416500237200215030ustar00rootroot00000000000000/* * Copyright 2015 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. 
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate.h" #include "libyuv/rotate_row.h" #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #define TANY(NAMEANY, TPOS_SIMD, MASK) \ void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ int dst_stride, int width) { \ int r = width & MASK; \ int n = width - r; \ if (n > 0) { \ TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ } \ TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ } #ifdef HAS_TRANSPOSEWX8_NEON TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) #endif #ifdef HAS_TRANSPOSEWX8_SSSE3 TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #endif #ifdef HAS_TRANSPOSEWX8_MMI TANY(TransposeWx8_Any_MMI, TransposeWx8_MMI, 7) #endif #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif #ifdef HAS_TRANSPOSEWX16_MSA TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ int width) { \ int r = width & MASK; \ int n = width - r; \ if (n > 0) { \ TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ } \ TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ } #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #endif #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif #ifdef HAS_TRANSPOSEUVWX8_MMI TUVANY(TransposeUVWx8_Any_MMI, TransposeUVWx8_MMI, 7) #endif #ifdef HAS_TRANSPOSEUVWX16_MSA TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) #endif #undef TUVANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif 
libyuv-0.0~git20220104.b91df1a/source/rotate_argb.cc000066400000000000000000000172631416500237200216370ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate.h" #include "libyuv/convert.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/row.h" #include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ #ifdef __cplusplus namespace libyuv { extern "C" { #endif static int ARGBTranspose(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { int i; int src_pixel_step = src_stride_argb >> 2; void (*ScaleARGBRowDownEven)( const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; // Check stride is a multiple of 4. if (src_stride_argb & 3) { return -1; } #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; if (IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MMI; if (IS_ALIGNED(height, 4)) { // Width of dest. 
ScaleARGBRowDownEven = ScaleARGBRowDownEven_MMI; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; if (IS_ALIGNED(height, 4)) { // Width of dest. ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; } } #endif for (i = 0; i < width; ++i) { // column of source to row of dest. ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); dst_argb += dst_stride_argb; src_argb += 4; } return 0; } static int ARGBRotate90(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. src_argb += src_stride_argb * (height - 1); src_stride_argb = -src_stride_argb; return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); } static int ARGBRotate270(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. dst_argb += dst_stride_argb * (width - 1); dst_stride_argb = -dst_stride_argb; return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); } static int ARGBRotate180(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height) { // Swap first and last row and mirror the content. Uses a temporary row. 
align_buffer_64(row, width * 4); const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); int half_height = (height + 1) >> 1; int y; void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBMirrorRow_C; void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = CopyRow_C; #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } #endif #if defined(HAS_ARGBMIRRORROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBMirrorRow = ARGBMirrorRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { ARGBMirrorRow = ARGBMirrorRow_SSE2; } } #endif #if defined(HAS_ARGBMIRRORROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBMirrorRow = ARGBMirrorRow_Any_AVX2; if (IS_ALIGNED(width, 8)) { ARGBMirrorRow = ARGBMirrorRow_AVX2; } } #endif #if defined(HAS_ARGBMIRRORROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { ARGBMirrorRow = ARGBMirrorRow_Any_MMI; if (IS_ALIGNED(width, 2)) { ARGBMirrorRow = ARGBMirrorRow_MMI; } } #endif #if defined(HAS_ARGBMIRRORROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { ARGBMirrorRow = ARGBMirrorRow_Any_MSA; if (IS_ALIGNED(width, 16)) { ARGBMirrorRow = ARGBMirrorRow_MSA; } } #endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; } #endif #if defined(HAS_COPYROW_AVX) if (TestCpuFlag(kCpuHasAVX)) { CopyRow = IS_ALIGNED(width * 4, 64) ? CopyRow_AVX : CopyRow_Any_AVX; } #endif #if defined(HAS_COPYROW_ERMS) if (TestCpuFlag(kCpuHasERMS)) { CopyRow = CopyRow_ERMS; } #endif #if defined(HAS_COPYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last src_argb += src_stride_argb; dst_argb += dst_stride_argb; src_bot -= src_stride_argb; dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); return 0; } LIBYUV_API int ARGBRotate(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_argb, int dst_stride_argb, int width, int height, enum RotationMode mode) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } switch (mode) { case kRotate0: // copy frame return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate270: return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate180: return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); default: break; } return -1; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate_common.cc000066400000000000000000000060061416500237200222050ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/rotate_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif void TransposeWx8_C(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { int i; for (i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; dst[1] = src[1 * src_stride]; dst[2] = src[2 * src_stride]; dst[3] = src[3 * src_stride]; dst[4] = src[4 * src_stride]; dst[5] = src[5 * src_stride]; dst[6] = src[6 * src_stride]; dst[7] = src[7 * src_stride]; ++src; dst += dst_stride; } } void TransposeUVWx8_C(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { int i; for (i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; dst_b[0] = src[0 * src_stride + 1]; dst_a[1] = src[1 * src_stride + 0]; dst_b[1] = src[1 * src_stride + 1]; dst_a[2] = src[2 * src_stride + 0]; dst_b[2] = src[2 * src_stride + 1]; dst_a[3] = src[3 * src_stride + 0]; dst_b[3] = src[3 * src_stride + 1]; dst_a[4] = src[4 * src_stride + 0]; dst_b[4] = src[4 * src_stride + 1]; dst_a[5] = src[5 * src_stride + 0]; dst_b[5] = src[5 * src_stride + 1]; dst_a[6] = src[6 * src_stride + 0]; dst_b[6] = src[6 * src_stride + 1]; dst_a[7] = src[7 * src_stride + 0]; dst_b[7] = src[7 * src_stride + 1]; src += 2; dst_a += dst_stride_a; dst_b += dst_stride_b; } } void TransposeWxH_C(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width, int height) { int i; for (i = 0; i < width; ++i) { int j; for (j = 0; j < height; ++j) { dst[i * dst_stride + j] = src[j * src_stride + i]; } } } void TransposeUVWxH_C(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width, int height) { int i; for (i = 0; i < width * 2; i += 2) { int j; for (j = 0; j < height; ++j) { dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; } } } #ifdef __cplusplus } // extern "C" } // 
namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate_gcc.cc000066400000000000000000000431741416500237200214600ustar00rootroot00000000000000/* * Copyright 2015 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { asm volatile( // Read in the data from the source pointer. // First round of bit swap. LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "movq (%0,%3),%%xmm1 \n" "lea (%0,%3,2),%0 \n" "punpcklbw %%xmm1,%%xmm0 \n" "movq (%0),%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "palignr $0x8,%%xmm1,%%xmm1 \n" "movq (%0,%3),%%xmm3 \n" "lea (%0,%3,2),%0 \n" "punpcklbw %%xmm3,%%xmm2 \n" "movdqa %%xmm2,%%xmm3 \n" "movq (%0),%%xmm4 \n" "palignr $0x8,%%xmm3,%%xmm3 \n" "movq (%0,%3),%%xmm5 \n" "lea (%0,%3,2),%0 \n" "punpcklbw %%xmm5,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "movq (%0),%%xmm6 \n" "palignr $0x8,%%xmm5,%%xmm5 \n" "movq (%0,%3),%%xmm7 \n" "lea (%0,%3,2),%0 \n" "punpcklbw %%xmm7,%%xmm6 \n" "neg %3 \n" "movdqa %%xmm6,%%xmm7 \n" "lea 0x8(%0,%3,8),%0 \n" "palignr $0x8,%%xmm7,%%xmm7 \n" "neg %3 \n" // Second round of bit swap. 
"punpcklwd %%xmm2,%%xmm0 \n" "punpcklwd %%xmm3,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "palignr $0x8,%%xmm2,%%xmm2 \n" "palignr $0x8,%%xmm3,%%xmm3 \n" "punpcklwd %%xmm6,%%xmm4 \n" "punpcklwd %%xmm7,%%xmm5 \n" "movdqa %%xmm4,%%xmm6 \n" "movdqa %%xmm5,%%xmm7 \n" "palignr $0x8,%%xmm6,%%xmm6 \n" "palignr $0x8,%%xmm7,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. "punpckldq %%xmm4,%%xmm0 \n" "movq %%xmm0,(%1) \n" "movdqa %%xmm0,%%xmm4 \n" "palignr $0x8,%%xmm4,%%xmm4 \n" "movq %%xmm4,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm6,%%xmm2 \n" "movdqa %%xmm2,%%xmm6 \n" "movq %%xmm2,(%1) \n" "palignr $0x8,%%xmm6,%%xmm6 \n" "punpckldq %%xmm5,%%xmm1 \n" "movq %%xmm6,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "movdqa %%xmm1,%%xmm5 \n" "movq %%xmm1,(%1) \n" "palignr $0x8,%%xmm5,%%xmm5 \n" "movq %%xmm5,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm7,%%xmm3 \n" "movq %%xmm3,(%1) \n" "movdqa %%xmm3,%%xmm7 \n" "palignr $0x8,%%xmm7,%%xmm7 \n" "sub $0x8,%2 \n" "movq %%xmm7,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) void TransposeWx8_Fast_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { asm volatile( // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%0,%3),%%xmm1 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm0,%%xmm8 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm8 \n" "movdqu (%0),%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm8,%%xmm9 \n" "palignr $0x8,%%xmm1,%%xmm1 \n" "palignr $0x8,%%xmm9,%%xmm9 \n" "movdqu (%0,%3),%%xmm3 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm2,%%xmm10 \n" "punpcklbw %%xmm3,%%xmm2 \n" "punpckhbw %%xmm3,%%xmm10 \n" "movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm10,%%xmm11 \n" "movdqu (%0),%%xmm4 \n" "palignr $0x8,%%xmm3,%%xmm3 \n" "palignr $0x8,%%xmm11,%%xmm11 \n" "movdqu (%0,%3),%%xmm5 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm4,%%xmm12 \n" "punpcklbw %%xmm5,%%xmm4 \n" "punpckhbw %%xmm5,%%xmm12 \n" "movdqa %%xmm4,%%xmm5 \n" "movdqa %%xmm12,%%xmm13 \n" "movdqu (%0),%%xmm6 \n" "palignr $0x8,%%xmm5,%%xmm5 \n" "palignr $0x8,%%xmm13,%%xmm13 \n" "movdqu (%0,%3),%%xmm7 \n" "lea (%0,%3,2),%0 \n" "movdqa %%xmm6,%%xmm14 \n" "punpcklbw %%xmm7,%%xmm6 \n" "punpckhbw %%xmm7,%%xmm14 \n" "neg %3 \n" "movdqa %%xmm6,%%xmm7 \n" "movdqa %%xmm14,%%xmm15 \n" "lea 0x10(%0,%3,8),%0 \n" "palignr $0x8,%%xmm7,%%xmm7 \n" "palignr $0x8,%%xmm15,%%xmm15 \n" "neg %3 \n" // Second round of bit swap. "punpcklwd %%xmm2,%%xmm0 \n" "punpcklwd %%xmm3,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "palignr $0x8,%%xmm2,%%xmm2 \n" "palignr $0x8,%%xmm3,%%xmm3 \n" "punpcklwd %%xmm6,%%xmm4 \n" "punpcklwd %%xmm7,%%xmm5 \n" "movdqa %%xmm4,%%xmm6 \n" "movdqa %%xmm5,%%xmm7 \n" "palignr $0x8,%%xmm6,%%xmm6 \n" "palignr $0x8,%%xmm7,%%xmm7 \n" "punpcklwd %%xmm10,%%xmm8 \n" "punpcklwd %%xmm11,%%xmm9 \n" "movdqa %%xmm8,%%xmm10 \n" "movdqa %%xmm9,%%xmm11 \n" "palignr $0x8,%%xmm10,%%xmm10 \n" "palignr $0x8,%%xmm11,%%xmm11 \n" "punpcklwd %%xmm14,%%xmm12 \n" "punpcklwd %%xmm15,%%xmm13 \n" "movdqa %%xmm12,%%xmm14 \n" "movdqa %%xmm13,%%xmm15 \n" "palignr $0x8,%%xmm14,%%xmm14 \n" "palignr $0x8,%%xmm15,%%xmm15 \n" // Third round of bit swap. // Write to the destination pointer. 
"punpckldq %%xmm4,%%xmm0 \n" "movq %%xmm0,(%1) \n" "movdqa %%xmm0,%%xmm4 \n" "palignr $0x8,%%xmm4,%%xmm4 \n" "movq %%xmm4,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm6,%%xmm2 \n" "movdqa %%xmm2,%%xmm6 \n" "movq %%xmm2,(%1) \n" "palignr $0x8,%%xmm6,%%xmm6 \n" "punpckldq %%xmm5,%%xmm1 \n" "movq %%xmm6,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "movdqa %%xmm1,%%xmm5 \n" "movq %%xmm1,(%1) \n" "palignr $0x8,%%xmm5,%%xmm5 \n" "movq %%xmm5,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm7,%%xmm3 \n" "movq %%xmm3,(%1) \n" "movdqa %%xmm3,%%xmm7 \n" "palignr $0x8,%%xmm7,%%xmm7 \n" "movq %%xmm7,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm12,%%xmm8 \n" "movq %%xmm8,(%1) \n" "movdqa %%xmm8,%%xmm12 \n" "palignr $0x8,%%xmm12,%%xmm12 \n" "movq %%xmm12,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm14,%%xmm10 \n" "movdqa %%xmm10,%%xmm14 \n" "movq %%xmm10,(%1) \n" "palignr $0x8,%%xmm14,%%xmm14 \n" "punpckldq %%xmm13,%%xmm9 \n" "movq %%xmm14,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "movdqa %%xmm9,%%xmm13 \n" "movq %%xmm9,(%1) \n" "palignr $0x8,%%xmm13,%%xmm13 \n" "movq %%xmm13,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "punpckldq %%xmm15,%%xmm11 \n" "movq %%xmm11,(%1) \n" "movdqa %%xmm11,%%xmm15 \n" "palignr $0x8,%%xmm15,%%xmm15 \n" "sub $0x10,%2 \n" "movq %%xmm15,(%1,%4) \n" "lea (%1,%4,2),%1 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"); } #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. #if defined(HAS_TRANSPOSEUVWX8_SSE2) void TransposeUVWx8_SSE2(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { asm volatile( // Read in the data from the source pointer. // First round of bit swap. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu (%0,%4),%%xmm1 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm0,%%xmm8 \n" "punpcklbw %%xmm1,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm8 \n" "movdqa %%xmm8,%%xmm1 \n" "movdqu (%0),%%xmm2 \n" "movdqu (%0,%4),%%xmm3 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm2,%%xmm8 \n" "punpcklbw %%xmm3,%%xmm2 \n" "punpckhbw %%xmm3,%%xmm8 \n" "movdqa %%xmm8,%%xmm3 \n" "movdqu (%0),%%xmm4 \n" "movdqu (%0,%4),%%xmm5 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm4,%%xmm8 \n" "punpcklbw %%xmm5,%%xmm4 \n" "punpckhbw %%xmm5,%%xmm8 \n" "movdqa %%xmm8,%%xmm5 \n" "movdqu (%0),%%xmm6 \n" "movdqu (%0,%4),%%xmm7 \n" "lea (%0,%4,2),%0 \n" "movdqa %%xmm6,%%xmm8 \n" "punpcklbw %%xmm7,%%xmm6 \n" "neg %4 \n" "lea 0x10(%0,%4,8),%0 \n" "punpckhbw %%xmm7,%%xmm8 \n" "movdqa %%xmm8,%%xmm7 \n" "neg %4 \n" // Second round of bit swap. "movdqa %%xmm0,%%xmm8 \n" "movdqa %%xmm1,%%xmm9 \n" "punpckhwd %%xmm2,%%xmm8 \n" "punpckhwd %%xmm3,%%xmm9 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpcklwd %%xmm3,%%xmm1 \n" "movdqa %%xmm8,%%xmm2 \n" "movdqa %%xmm9,%%xmm3 \n" "movdqa %%xmm4,%%xmm8 \n" "movdqa %%xmm5,%%xmm9 \n" "punpckhwd %%xmm6,%%xmm8 \n" "punpckhwd %%xmm7,%%xmm9 \n" "punpcklwd %%xmm6,%%xmm4 \n" "punpcklwd %%xmm7,%%xmm5 \n" "movdqa %%xmm8,%%xmm6 \n" "movdqa %%xmm9,%%xmm7 \n" // Third round of bit swap. // Write to the destination pointer. 
"movdqa %%xmm0,%%xmm8 \n" "punpckldq %%xmm4,%%xmm0 \n" "movlpd %%xmm0,(%1) \n" // Write back U channel "movhpd %%xmm0,(%2) \n" // Write back V channel "punpckhdq %%xmm4,%%xmm8 \n" "movlpd %%xmm8,(%1,%5) \n" "lea (%1,%5,2),%1 \n" "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" "movdqa %%xmm2,%%xmm8 \n" "punpckldq %%xmm6,%%xmm2 \n" "movlpd %%xmm2,(%1) \n" "movhpd %%xmm2,(%2) \n" "punpckhdq %%xmm6,%%xmm8 \n" "movlpd %%xmm8,(%1,%5) \n" "lea (%1,%5,2),%1 \n" "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" "movdqa %%xmm1,%%xmm8 \n" "punpckldq %%xmm5,%%xmm1 \n" "movlpd %%xmm1,(%1) \n" "movhpd %%xmm1,(%2) \n" "punpckhdq %%xmm5,%%xmm8 \n" "movlpd %%xmm8,(%1,%5) \n" "lea (%1,%5,2),%1 \n" "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" "movdqa %%xmm3,%%xmm8 \n" "punpckldq %%xmm7,%%xmm3 \n" "movlpd %%xmm3,(%1) \n" "movhpd %%xmm3,(%2) \n" "punpckhdq %%xmm7,%%xmm8 \n" "sub $0x8,%3 \n" "movlpd %%xmm8,(%1,%5) \n" "lea (%1,%5,2),%1 \n" "movhpd %%xmm8,(%2,%6) \n" "lea (%2,%6,2),%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst_a), // %1 "+r"(dst_b), // %2 "+r"(width) // %3 : "r"((intptr_t)(src_stride)), // %4 "r"((intptr_t)(dst_stride_a)), // %5 "r"((intptr_t)(dst_stride_b)) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate_mmi.cc000066400000000000000000000360301416500237200214770ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/rotate_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) void TransposeWx8_MMI(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; uint8_t* src_tmp = nullptr; __asm__ volatile( "1: \n\t" "ldc1 %[tmp12], 0x00(%[src]) \n\t" "dadd %[src_tmp], %[src], %[src_stride] \n\t" "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp0 = (00 10 01 11 02 12 03 13) */ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" /* tmp1 = (04 14 05 15 06 16 07 17) */ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp2 = (20 30 21 31 22 32 23 33) */ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" /* tmp3 = (24 34 25 35 26 36 27 37) */ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" /* tmp4 = (00 10 20 30 01 11 21 31) */ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" /* tmp5 = (02 12 22 32 03 13 23 33) */ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" /* tmp6 = (04 14 24 34 05 15 25 35) */ "punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" /* tmp7 = (06 16 26 36 07 17 27 37) */ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp0 = (40 50 41 51 42 52 43 53) */ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" /* tmp1 = (44 54 45 55 46 56 47 57) */ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp2 = (60 70 61 71 62 72 63 73) 
*/ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" /* tmp3 = (64 74 65 75 66 76 67 77) */ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" /* tmp8 = (40 50 60 70 41 51 61 71) */ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" /* tmp9 = (42 52 62 72 43 53 63 73) */ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" /* tmp10 = (44 54 64 74 45 55 65 75) */ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" /* tmp11 = (46 56 66 76 47 57 67 77) */ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" /* tmp0 = (00 10 20 30 40 50 60 70) */ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" /* tmp1 = (01 11 21 31 41 51 61 71) */ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" /* tmp0 = (02 12 22 32 42 52 62 72) */ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" /* tmp1 = (03 13 23 33 43 53 63 73) */ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" /* tmp0 = (04 14 24 34 44 54 64 74) */ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" /* tmp1 = (05 15 25 35 45 55 65 75) */ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst]) \n\t" /* tmp0 = (06 16 26 36 46 56 66 76) */ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" /* tmp1 = (07 17 27 37 47 57 67 77) */ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst]) \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst]) \n\t" "gssdrc1 %[tmp1], 
0x00(%[dst]) \n\t" "dadd %[dst], %[dst], %[dst_stride] \n\t" "daddi %[src], %[src], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst] "+&r"(dst), [src_tmp] "+&r"(src_tmp) : [src] "r"(src), [width] "r"(width), [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride) : "memory"); } void TransposeUVWx8_MMI(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { uint64_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6; uint64_t tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13; uint8_t* src_tmp = nullptr; __asm__ volatile( "1: \n\t" /* tmp12 = (u00 v00 u01 v01 u02 v02 u03 v03) */ "ldc1 %[tmp12], 0x00(%[src]) \n\t" "dadd %[src_tmp], %[src], %[src_stride] \n\t" /* tmp13 = (u10 v10 u11 v11 u12 v12 u13 v13) */ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp0 = (u00 u10 v00 v10 u01 u11 v01 v11) */ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" /* tmp1 = (u02 u12 v02 v12 u03 u13 v03 v13) */ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" /* tmp12 = (u20 v20 u21 v21 u22 v22 u23 v23) */ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" /* tmp13 = (u30 v30 u31 v31 u32 v32 u33 v33) */ "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp2 = (u20 u30 v20 v30 u21 u31 v21 v31) */ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" /* tmp3 = (u22 u32 v22 v32 u23 u33 v23 v33) */ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" /* tmp4 = (u00 u10 u20 u30 v00 v10 v20 v30) */ "punpcklhw %[tmp4], %[tmp0], %[tmp2] \n\t" /* tmp5 = (u01 u11 u21 u31 v01 v11 v21 v31) */ "punpckhhw %[tmp5], %[tmp0], %[tmp2] \n\t" /* tmp6 = (u02 u12 u22 u32 v02 v12 v22 v32) */ 
"punpcklhw %[tmp6], %[tmp1], %[tmp3] \n\t" /* tmp7 = (u03 u13 u23 u33 v03 v13 v23 v33) */ "punpckhhw %[tmp7], %[tmp1], %[tmp3] \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" /* tmp12 = (u40 v40 u41 v41 u42 v42 u43 v43) */ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" /* tmp13 = (u50 v50 u51 v51 u52 v52 u53 v53) */ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp0 = (u40 u50 v40 v50 u41 u51 v41 v51) */ "punpcklbh %[tmp0], %[tmp12], %[tmp13] \n\t" /* tmp1 = (u42 u52 v42 v52 u43 u53 v43 v53) */ "punpckhbh %[tmp1], %[tmp12], %[tmp13] \n\t" "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" /* tmp12 = (u60 v60 u61 v61 u62 v62 u63 v63) */ "ldc1 %[tmp12], 0x00(%[src_tmp]) \n\t" /* tmp13 = (u70 v70 u71 v71 u72 v72 u73 v73) */ "dadd %[src_tmp], %[src_tmp], %[src_stride] \n\t" "ldc1 %[tmp13], 0x00(%[src_tmp]) \n\t" /* tmp2 = (u60 u70 v60 v70 u61 u71 v61 v71) */ "punpcklbh %[tmp2], %[tmp12], %[tmp13] \n\t" /* tmp3 = (u62 u72 v62 v72 u63 u73 v63 v73) */ "punpckhbh %[tmp3], %[tmp12], %[tmp13] \n\t" /* tmp8 = (u40 u50 u60 u70 v40 v50 v60 v70) */ "punpcklhw %[tmp8], %[tmp0], %[tmp2] \n\t" /* tmp9 = (u41 u51 u61 u71 v41 v51 v61 v71) */ "punpckhhw %[tmp9], %[tmp0], %[tmp2] \n\t" /* tmp10 = (u42 u52 u62 u72 v42 v52 v62 v72) */ "punpcklhw %[tmp10], %[tmp1], %[tmp3] \n\t" /* tmp11 = (u43 u53 u63 u73 v43 v53 v63 v73) */ "punpckhhw %[tmp11], %[tmp1], %[tmp3] \n\t" /* tmp0 = (u00 u10 u20 u30 u40 u50 u60 u70) */ "punpcklwd %[tmp0], %[tmp4], %[tmp8] \n\t" /* tmp1 = (v00 v10 v20 v30 v40 v50 v60 v70) */ "punpckhwd %[tmp1], %[tmp4], %[tmp8] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" /* tmp0 = (u01 u11 u21 u31 u41 u51 u61 u71) */ "punpcklwd %[tmp0], %[tmp5], %[tmp9] \n\t" /* tmp1 = (v01 v11 v21 v31 v41 v51 v61 v71) */ "punpckhwd %[tmp1], %[tmp5], %[tmp9] \n\t" "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" "gssdlc1 %[tmp0], 
0x07(%[dst_a]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" /* tmp0 = (u02 u12 u22 u32 u42 u52 u62 u72) */ "punpcklwd %[tmp0], %[tmp6], %[tmp10] \n\t" /* tmp1 = (v02 v12 v22 v32 v42 v52 v62 v72) */ "punpckhwd %[tmp1], %[tmp6], %[tmp10] \n\t" "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" /* tmp0 = (u03 u13 u23 u33 u43 u53 u63 u73) */ "punpcklwd %[tmp0], %[tmp7], %[tmp11] \n\t" /* tmp1 = (v03 v13 v23 v33 v43 v53 v63 v73) */ "punpckhwd %[tmp1], %[tmp7], %[tmp11] \n\t" "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" "gssdlc1 %[tmp0], 0x07(%[dst_a]) \n\t" "gssdrc1 %[tmp0], 0x00(%[dst_a]) \n\t" "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" "gssdlc1 %[tmp1], 0x07(%[dst_b]) \n\t" "gssdrc1 %[tmp1], 0x00(%[dst_b]) \n\t" "dadd %[dst_a], %[dst_a], %[dst_stride_a] \n\t" "dadd %[dst_b], %[dst_b], %[dst_stride_b] \n\t" "daddiu %[src], %[src], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), [tmp3] "=&f"(tmp3), [tmp4] "=&f"(tmp4), [tmp5] "=&f"(tmp5), [tmp6] "=&f"(tmp6), [tmp7] "=&f"(tmp7), [tmp8] "=&f"(tmp8), [tmp9] "=&f"(tmp9), [tmp10] "=&f"(tmp10), [tmp11] "=&f"(tmp11), [tmp12] "=&f"(tmp12), [tmp13] "=&f"(tmp13), [dst_a] "+&r"(dst_a), [dst_b] "+&r"(dst_b), [src_tmp] "+&r"(src_tmp) : [src] "r"(src), [width] "r"(width), [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b), [src_stride] "r"(src_stride) : "memory"); } #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif 
libyuv-0.0~git20220104.b91df1a/source/rotate_msa.cc000066400000000000000000000232101416500237200214710ustar00rootroot00000000000000/* * Copyright 2016 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate_row.h" // This module is for GCC MSA #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #include "libyuv/macros_msa.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ } #define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ } #define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ } #define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ { \ out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ } void TransposeWx16_C(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { TransposeWx8_C(src, src_stride, 
dst, dst_stride, width); TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, width); } void TransposeUVWx16_C(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), dst_stride_a, (dst_b + 8), dst_stride_b, width); } void TransposeWx16_MSA(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { int x; const uint8_t* s; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; for (x = 0; x < width; x += 16) { s = src; src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s 
+= src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); dst += dst_stride * 4; res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); dst += dst_stride * 4; res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); dst += dst_stride * 4; res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); src += 16; dst += dst_stride * 4; } } void TransposeUVWx16_MSA(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { int x; const uint8_t* s; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; for (x = 0; x < width; x += 8) { s = src; src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, 
reg2, reg3); src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); s += src_stride; ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); ST_UB2(dst0, dst2, dst_a, dst_stride_a); ST_UB2(dst1, dst3, dst_b, dst_stride_b); dst_a += dst_stride_a * 2; dst_b += dst_stride_b * 2; res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); ST_UB2(dst0, dst2, dst_a, dst_stride_a); ST_UB2(dst1, dst3, dst_b, dst_stride_b); dst_a += dst_stride_a * 2; dst_b += dst_stride_b * 2; res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); ST_UB2(dst0, dst2, dst_a, dst_stride_a); ST_UB2(dst1, dst3, dst_b, 
dst_stride_b); dst_a += dst_stride_a * 2; dst_b += dst_stride_b * 2; res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); ST_UB2(dst0, dst2, dst_a, dst_stride_a); ST_UB2(dst1, dst3, dst_b, dst_stride_b); src += 16; dst_a += dst_stride_a * 2; dst_b += dst_stride_b * 2; } } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) libyuv-0.0~git20220104.b91df1a/source/rotate_neon.cc000066400000000000000000000421001416500237200216470ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate_row.h" #include "libyuv/row.h" #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; void TransposeWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { const uint8_t* src_temp; asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this "sub %5, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" "mov %0, %1 \n" "vld1.8 {d0}, [%0], %2 \n" "vld1.8 {d1}, [%0], %2 \n" "vld1.8 {d2}, [%0], %2 \n" "vld1.8 {d3}, [%0], %2 \n" "vld1.8 {d4}, [%0], %2 \n" "vld1.8 {d5}, [%0], %2 \n" "vld1.8 {d6}, [%0], %2 \n" "vld1.8 {d7}, [%0] \n" "vtrn.8 d1, d0 \n" "vtrn.8 d3, d2 \n" "vtrn.8 d5, d4 \n" "vtrn.8 d7, d6 \n" "vtrn.16 d1, d3 \n" "vtrn.16 d0, d2 \n" "vtrn.16 d5, d7 \n" "vtrn.16 d4, d6 \n" "vtrn.32 d1, d5 \n" "vtrn.32 d0, d4 \n" "vtrn.32 d3, d7 \n" "vtrn.32 d2, d6 \n" "vrev16.8 q0, q0 \n" "vrev16.8 q1, q1 \n" "vrev16.8 q2, q2 \n" "vrev16.8 q3, q3 \n" "mov %0, %3 \n" "vst1.8 {d1}, [%0], %4 \n" "vst1.8 {d0}, [%0], %4 \n" "vst1.8 {d3}, [%0], %4 \n" "vst1.8 {d2}, [%0], %4 \n" "vst1.8 {d5}, [%0], %4 \n" "vst1.8 {d4}, [%0], %4 \n" "vst1.8 {d7}, [%0], %4 \n" "vst1.8 {d6}, [%0] \n" "add %1, #8 \n" // src += 8 "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride "subs %5, #8 \n" // w -= 8 "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. "adds %5, #8 \n" "beq 4f \n" // some residual, so between 1 and 7 lines left to transpose "cmp %5, #2 \n" "blt 3f \n" "cmp %5, #4 \n" "blt 2f \n" // 4x8 block "mov %0, %1 \n" "vld1.32 {d0[0]}, [%0], %2 \n" "vld1.32 {d0[1]}, [%0], %2 \n" "vld1.32 {d1[0]}, [%0], %2 \n" "vld1.32 {d1[1]}, [%0], %2 \n" "vld1.32 {d2[0]}, [%0], %2 \n" "vld1.32 {d2[1]}, [%0], %2 \n" "vld1.32 {d3[0]}, [%0], %2 \n" "vld1.32 {d3[1]}, [%0] \n" "mov %0, %3 \n" "vld1.8 {q3}, [%6] \n" "vtbl.8 d4, {d0, d1}, d6 \n" "vtbl.8 d5, {d0, d1}, d7 \n" "vtbl.8 d0, {d2, d3}, d6 \n" "vtbl.8 d1, {d2, d3}, d7 \n" // TODO(frkoenig): Rework shuffle above to // write out with 4 instead of 8 writes. 
"vst1.32 {d4[0]}, [%0], %4 \n" "vst1.32 {d4[1]}, [%0], %4 \n" "vst1.32 {d5[0]}, [%0], %4 \n" "vst1.32 {d5[1]}, [%0] \n" "add %0, %3, #4 \n" "vst1.32 {d0[0]}, [%0], %4 \n" "vst1.32 {d0[1]}, [%0], %4 \n" "vst1.32 {d1[0]}, [%0], %4 \n" "vst1.32 {d1[1]}, [%0] \n" "add %1, #4 \n" // src += 4 "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride "subs %5, #4 \n" // w -= 4 "beq 4f \n" // some residual, check to see if it includes a 2x8 block, // or less "cmp %5, #2 \n" "blt 3f \n" // 2x8 block "2: \n" "mov %0, %1 \n" "vld1.16 {d0[0]}, [%0], %2 \n" "vld1.16 {d1[0]}, [%0], %2 \n" "vld1.16 {d0[1]}, [%0], %2 \n" "vld1.16 {d1[1]}, [%0], %2 \n" "vld1.16 {d0[2]}, [%0], %2 \n" "vld1.16 {d1[2]}, [%0], %2 \n" "vld1.16 {d0[3]}, [%0], %2 \n" "vld1.16 {d1[3]}, [%0] \n" "vtrn.8 d0, d1 \n" "mov %0, %3 \n" "vst1.64 {d0}, [%0], %4 \n" "vst1.64 {d1}, [%0] \n" "add %1, #2 \n" // src += 2 "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride "subs %5, #2 \n" // w -= 2 "beq 4f \n" // 1x8 block "3: \n" "vld1.8 {d0[0]}, [%1], %2 \n" "vld1.8 {d0[1]}, [%1], %2 \n" "vld1.8 {d0[2]}, [%1], %2 \n" "vld1.8 {d0[3]}, [%1], %2 \n" "vld1.8 {d0[4]}, [%1], %2 \n" "vld1.8 {d0[5]}, [%1], %2 \n" "vld1.8 {d0[6]}, [%1], %2 \n" "vld1.8 {d0[7]}, [%1] \n" "vst1.64 {d0}, [%3] \n" "4: \n" : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(src_stride), // %2 "+r"(dst), // %3 "+r"(dst_stride), // %4 "+r"(width) // %5 : "r"(&kVTbl4x4Transpose) // %6 : "memory", "cc", "q0", "q1", "q2", "q3"); } static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { const uint8_t* src_temp; asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this "sub %7, #8 \n" // handle 8x8 blocks. 
this should be the majority of the plane "1: \n" "mov %0, %1 \n" "vld2.8 {d0, d1}, [%0], %2 \n" "vld2.8 {d2, d3}, [%0], %2 \n" "vld2.8 {d4, d5}, [%0], %2 \n" "vld2.8 {d6, d7}, [%0], %2 \n" "vld2.8 {d16, d17}, [%0], %2 \n" "vld2.8 {d18, d19}, [%0], %2 \n" "vld2.8 {d20, d21}, [%0], %2 \n" "vld2.8 {d22, d23}, [%0] \n" "vtrn.8 q1, q0 \n" "vtrn.8 q3, q2 \n" "vtrn.8 q9, q8 \n" "vtrn.8 q11, q10 \n" "vtrn.16 q1, q3 \n" "vtrn.16 q0, q2 \n" "vtrn.16 q9, q11 \n" "vtrn.16 q8, q10 \n" "vtrn.32 q1, q9 \n" "vtrn.32 q0, q8 \n" "vtrn.32 q3, q11 \n" "vtrn.32 q2, q10 \n" "vrev16.8 q0, q0 \n" "vrev16.8 q1, q1 \n" "vrev16.8 q2, q2 \n" "vrev16.8 q3, q3 \n" "vrev16.8 q8, q8 \n" "vrev16.8 q9, q9 \n" "vrev16.8 q10, q10 \n" "vrev16.8 q11, q11 \n" "mov %0, %3 \n" "vst1.8 {d2}, [%0], %4 \n" "vst1.8 {d0}, [%0], %4 \n" "vst1.8 {d6}, [%0], %4 \n" "vst1.8 {d4}, [%0], %4 \n" "vst1.8 {d18}, [%0], %4 \n" "vst1.8 {d16}, [%0], %4 \n" "vst1.8 {d22}, [%0], %4 \n" "vst1.8 {d20}, [%0] \n" "mov %0, %5 \n" "vst1.8 {d3}, [%0], %6 \n" "vst1.8 {d1}, [%0], %6 \n" "vst1.8 {d7}, [%0], %6 \n" "vst1.8 {d5}, [%0], %6 \n" "vst1.8 {d19}, [%0], %6 \n" "vst1.8 {d17}, [%0], %6 \n" "vst1.8 {d23}, [%0], %6 \n" "vst1.8 {d21}, [%0] \n" "add %1, #8*2 \n" // src += 8*2 "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * // dst_stride_a "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * // dst_stride_b "subs %7, #8 \n" // w -= 8 "bge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. 
"adds %7, #8 \n" "beq 4f \n" // some residual, so between 1 and 7 lines left to transpose "cmp %7, #2 \n" "blt 3f \n" "cmp %7, #4 \n" "blt 2f \n" // TODO(frkoenig): Clean this up // 4x8 block "mov %0, %1 \n" "vld1.64 {d0}, [%0], %2 \n" "vld1.64 {d1}, [%0], %2 \n" "vld1.64 {d2}, [%0], %2 \n" "vld1.64 {d3}, [%0], %2 \n" "vld1.64 {d4}, [%0], %2 \n" "vld1.64 {d5}, [%0], %2 \n" "vld1.64 {d6}, [%0], %2 \n" "vld1.64 {d7}, [%0] \n" "vld1.8 {q15}, [%8] \n" "vtrn.8 q0, q1 \n" "vtrn.8 q2, q3 \n" "vtbl.8 d16, {d0, d1}, d30 \n" "vtbl.8 d17, {d0, d1}, d31 \n" "vtbl.8 d18, {d2, d3}, d30 \n" "vtbl.8 d19, {d2, d3}, d31 \n" "vtbl.8 d20, {d4, d5}, d30 \n" "vtbl.8 d21, {d4, d5}, d31 \n" "vtbl.8 d22, {d6, d7}, d30 \n" "vtbl.8 d23, {d6, d7}, d31 \n" "mov %0, %3 \n" "vst1.32 {d16[0]}, [%0], %4 \n" "vst1.32 {d16[1]}, [%0], %4 \n" "vst1.32 {d17[0]}, [%0], %4 \n" "vst1.32 {d17[1]}, [%0], %4 \n" "add %0, %3, #4 \n" "vst1.32 {d20[0]}, [%0], %4 \n" "vst1.32 {d20[1]}, [%0], %4 \n" "vst1.32 {d21[0]}, [%0], %4 \n" "vst1.32 {d21[1]}, [%0] \n" "mov %0, %5 \n" "vst1.32 {d18[0]}, [%0], %6 \n" "vst1.32 {d18[1]}, [%0], %6 \n" "vst1.32 {d19[0]}, [%0], %6 \n" "vst1.32 {d19[1]}, [%0], %6 \n" "add %0, %5, #4 \n" "vst1.32 {d22[0]}, [%0], %6 \n" "vst1.32 {d22[1]}, [%0], %6 \n" "vst1.32 {d23[0]}, [%0], %6 \n" "vst1.32 {d23[1]}, [%0] \n" "add %1, #4*2 \n" // src += 4 * 2 "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * // dst_stride_a "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * // dst_stride_b "subs %7, #4 \n" // w -= 4 "beq 4f \n" // some residual, check to see if it includes a 2x8 block, // or less "cmp %7, #2 \n" "blt 3f \n" // 2x8 block "2: \n" "mov %0, %1 \n" "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" "vld2.16 {d1[3], d3[3]}, [%0] \n" "vtrn.8 d0, d1 \n" "vtrn.8 d2, d3 \n" "mov %0, %3 \n" 
"vst1.64 {d0}, [%0], %4 \n" "vst1.64 {d2}, [%0] \n" "mov %0, %5 \n" "vst1.64 {d1}, [%0], %6 \n" "vst1.64 {d3}, [%0] \n" "add %1, #2*2 \n" // src += 2 * 2 "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * // dst_stride_a "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * // dst_stride_b "subs %7, #2 \n" // w -= 2 "beq 4f \n" // 1x8 block "3: \n" "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" "vld2.8 {d0[7], d1[7]}, [%1] \n" "vst1.64 {d0}, [%3] \n" "vst1.64 {d1}, [%5] \n" "4: \n" : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(src_stride), // %2 "+r"(dst_a), // %3 "+r"(dst_stride_a), // %4 "+r"(dst_b), // %5 "+r"(dst_stride_b), // %6 "+r"(width) // %7 : "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate_neon64.cc000066400000000000000000000461421416500237200220330ustar00rootroot00000000000000/* * Copyright 2014 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/rotate_row.h" #include "libyuv/row.h" #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC Neon armv8 64 bit. 
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; void TransposeWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { const uint8_t* src_temp; asm volatile( // loops are on blocks of 8. loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this "sub %w3, %w3, #8 \n" // handle 8x8 blocks. this should be the majority of the plane "1: \n" "mov %0, %1 \n" "ld1 {v0.8b}, [%0], %5 \n" "ld1 {v1.8b}, [%0], %5 \n" "ld1 {v2.8b}, [%0], %5 \n" "ld1 {v3.8b}, [%0], %5 \n" "ld1 {v4.8b}, [%0], %5 \n" "ld1 {v5.8b}, [%0], %5 \n" "ld1 {v6.8b}, [%0], %5 \n" "ld1 {v7.8b}, [%0] \n" "mov %0, %1 \n" "trn2 v16.8b, v0.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "trn1 v17.8b, v0.8b, v1.8b \n" "add %0, %0, %5 \n" "trn2 v18.8b, v2.8b, v3.8b \n" "prfm pldl1keep, [%0, 448] \n" // row 1 "trn1 v19.8b, v2.8b, v3.8b \n" "add %0, %0, %5 \n" "trn2 v20.8b, v4.8b, v5.8b \n" "prfm pldl1keep, [%0, 448] \n" // row 2 "trn1 v21.8b, v4.8b, v5.8b \n" "add %0, %0, %5 \n" "trn2 v22.8b, v6.8b, v7.8b \n" "prfm pldl1keep, [%0, 448] \n" // row 3 "trn1 v23.8b, v6.8b, v7.8b \n" "add %0, %0, %5 \n" "trn2 v3.4h, v17.4h, v19.4h \n" "prfm pldl1keep, [%0, 448] \n" // row 4 "trn1 v1.4h, v17.4h, v19.4h \n" "add %0, %0, %5 \n" "trn2 v2.4h, v16.4h, v18.4h \n" "prfm pldl1keep, [%0, 448] \n" // row 5 "trn1 v0.4h, v16.4h, v18.4h \n" "add %0, %0, %5 \n" "trn2 v7.4h, v21.4h, v23.4h \n" "prfm pldl1keep, [%0, 448] \n" // row 6 "trn1 v5.4h, v21.4h, v23.4h \n" "add %0, %0, %5 \n" "trn2 v6.4h, v20.4h, v22.4h \n" "prfm pldl1keep, [%0, 448] \n" // row 7 "trn1 v4.4h, v20.4h, v22.4h \n" "trn2 v21.2s, v1.2s, v5.2s \n" "trn1 v17.2s, v1.2s, v5.2s \n" "trn2 v20.2s, v0.2s, v4.2s \n" "trn1 v16.2s, v0.2s, v4.2s \n" "trn2 v23.2s, v3.2s, v7.2s \n" "trn1 v19.2s, v3.2s, v7.2s \n" "trn2 v22.2s, v2.2s, v6.2s \n" "trn1 v18.2s, v2.2s, v6.2s \n" "mov %0, %2 
\n" "st1 {v17.8b}, [%0], %6 \n" "st1 {v16.8b}, [%0], %6 \n" "st1 {v19.8b}, [%0], %6 \n" "st1 {v18.8b}, [%0], %6 \n" "st1 {v21.8b}, [%0], %6 \n" "st1 {v20.8b}, [%0], %6 \n" "st1 {v23.8b}, [%0], %6 \n" "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride "subs %w3, %w3, #8 \n" // w -= 8 "b.ge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. "adds %w3, %w3, #8 \n" "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose "cmp %w3, #2 \n" "b.lt 3f \n" "cmp %w3, #4 \n" "b.lt 2f \n" // 4x8 block "mov %0, %1 \n" "ld1 {v0.s}[0], [%0], %5 \n" "ld1 {v0.s}[1], [%0], %5 \n" "ld1 {v0.s}[2], [%0], %5 \n" "ld1 {v0.s}[3], [%0], %5 \n" "ld1 {v1.s}[0], [%0], %5 \n" "ld1 {v1.s}[1], [%0], %5 \n" "ld1 {v1.s}[2], [%0], %5 \n" "ld1 {v1.s}[3], [%0] \n" "mov %0, %2 \n" "ld1 {v2.16b}, [%4] \n" "tbl v3.16b, {v0.16b}, v2.16b \n" "tbl v0.16b, {v1.16b}, v2.16b \n" // TODO(frkoenig): Rework shuffle above to // write out with 4 instead of 8 writes. 
"st1 {v3.s}[0], [%0], %6 \n" "st1 {v3.s}[1], [%0], %6 \n" "st1 {v3.s}[2], [%0], %6 \n" "st1 {v3.s}[3], [%0] \n" "add %0, %2, #4 \n" "st1 {v0.s}[0], [%0], %6 \n" "st1 {v0.s}[1], [%0], %6 \n" "st1 {v0.s}[2], [%0], %6 \n" "st1 {v0.s}[3], [%0] \n" "add %1, %1, #4 \n" // src += 4 "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride "subs %w3, %w3, #4 \n" // w -= 4 "b.eq 4f \n" // some residual, check to see if it includes a 2x8 block, // or less "cmp %w3, #2 \n" "b.lt 3f \n" // 2x8 block "2: \n" "mov %0, %1 \n" "ld1 {v0.h}[0], [%0], %5 \n" "ld1 {v1.h}[0], [%0], %5 \n" "ld1 {v0.h}[1], [%0], %5 \n" "ld1 {v1.h}[1], [%0], %5 \n" "ld1 {v0.h}[2], [%0], %5 \n" "ld1 {v1.h}[2], [%0], %5 \n" "ld1 {v0.h}[3], [%0], %5 \n" "ld1 {v1.h}[3], [%0] \n" "trn2 v2.8b, v0.8b, v1.8b \n" "trn1 v3.8b, v0.8b, v1.8b \n" "mov %0, %2 \n" "st1 {v3.8b}, [%0], %6 \n" "st1 {v2.8b}, [%0] \n" "add %1, %1, #2 \n" // src += 2 "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride "subs %w3, %w3, #2 \n" // w -= 2 "b.eq 4f \n" // 1x8 block "3: \n" "ld1 {v0.b}[0], [%1], %5 \n" "ld1 {v0.b}[1], [%1], %5 \n" "ld1 {v0.b}[2], [%1], %5 \n" "ld1 {v0.b}[3], [%1], %5 \n" "ld1 {v0.b}[4], [%1], %5 \n" "ld1 {v0.b}[5], [%1], %5 \n" "ld1 {v0.b}[6], [%1], %5 \n" "ld1 {v0.b}[7], [%1] \n" "st1 {v0.8b}, [%2] \n" "4: \n" : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(dst), // %2 "+r"(width) // %3 : "r"(&kVTbl4x4Transpose), // %4 "r"(static_cast(src_stride)), // %5 "r"(static_cast(dst_stride)) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } static const uint8_t kVTbl4x4TransposeDi[32] = { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; void TransposeUVWx8_NEON(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) { const uint8_t* src_temp; asm volatile( // loops are on blocks of 8. 
loop will stop when // counter gets to or below 0. starting the counter // at w-8 allow for this "sub %w4, %w4, #8 \n" // handle 8x8 blocks. this should be the majority of the plane "1: \n" "mov %0, %1 \n" "ld1 {v0.16b}, [%0], %5 \n" "ld1 {v1.16b}, [%0], %5 \n" "ld1 {v2.16b}, [%0], %5 \n" "ld1 {v3.16b}, [%0], %5 \n" "ld1 {v4.16b}, [%0], %5 \n" "ld1 {v5.16b}, [%0], %5 \n" "ld1 {v6.16b}, [%0], %5 \n" "ld1 {v7.16b}, [%0] \n" "mov %0, %1 \n" "trn1 v16.16b, v0.16b, v1.16b \n" "trn2 v17.16b, v0.16b, v1.16b \n" "trn1 v18.16b, v2.16b, v3.16b \n" "trn2 v19.16b, v2.16b, v3.16b \n" "trn1 v20.16b, v4.16b, v5.16b \n" "trn2 v21.16b, v4.16b, v5.16b \n" "trn1 v22.16b, v6.16b, v7.16b \n" "trn2 v23.16b, v6.16b, v7.16b \n" "trn1 v0.8h, v16.8h, v18.8h \n" "trn2 v1.8h, v16.8h, v18.8h \n" "trn1 v2.8h, v20.8h, v22.8h \n" "trn2 v3.8h, v20.8h, v22.8h \n" "trn1 v4.8h, v17.8h, v19.8h \n" "trn2 v5.8h, v17.8h, v19.8h \n" "trn1 v6.8h, v21.8h, v23.8h \n" "trn2 v7.8h, v21.8h, v23.8h \n" "trn1 v16.4s, v0.4s, v2.4s \n" "trn2 v17.4s, v0.4s, v2.4s \n" "trn1 v18.4s, v1.4s, v3.4s \n" "trn2 v19.4s, v1.4s, v3.4s \n" "trn1 v20.4s, v4.4s, v6.4s \n" "trn2 v21.4s, v4.4s, v6.4s \n" "trn1 v22.4s, v5.4s, v7.4s \n" "trn2 v23.4s, v5.4s, v7.4s \n" "mov %0, %2 \n" "st1 {v16.d}[0], [%0], %6 \n" "st1 {v18.d}[0], [%0], %6 \n" "st1 {v17.d}[0], [%0], %6 \n" "st1 {v19.d}[0], [%0], %6 \n" "st1 {v16.d}[1], [%0], %6 \n" "st1 {v18.d}[1], [%0], %6 \n" "st1 {v17.d}[1], [%0], %6 \n" "st1 {v19.d}[1], [%0] \n" "mov %0, %3 \n" "st1 {v20.d}[0], [%0], %7 \n" "st1 {v22.d}[0], [%0], %7 \n" "st1 {v21.d}[0], [%0], %7 \n" "st1 {v23.d}[0], [%0], %7 \n" "st1 {v20.d}[1], [%0], %7 \n" "st1 {v22.d}[1], [%0], %7 \n" "st1 {v21.d}[1], [%0], %7 \n" "st1 {v23.d}[1], [%0] \n" "add %1, %1, #16 \n" // src += 8*2 "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * // dst_stride_a "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * // dst_stride_b "subs %w4, %w4, #8 \n" // w -= 8 "b.ge 1b \n" // add 8 back to counter. if the result is 0 there are // no residuals. 
"adds %w4, %w4, #8 \n" "b.eq 4f \n" // some residual, so between 1 and 7 lines left to transpose "cmp %w4, #2 \n" "b.lt 3f \n" "cmp %w4, #4 \n" "b.lt 2f \n" // TODO(frkoenig): Clean this up // 4x8 block "mov %0, %1 \n" "ld1 {v0.8b}, [%0], %5 \n" "ld1 {v1.8b}, [%0], %5 \n" "ld1 {v2.8b}, [%0], %5 \n" "ld1 {v3.8b}, [%0], %5 \n" "ld1 {v4.8b}, [%0], %5 \n" "ld1 {v5.8b}, [%0], %5 \n" "ld1 {v6.8b}, [%0], %5 \n" "ld1 {v7.8b}, [%0] \n" "ld1 {v30.16b}, [%8], #16 \n" "ld1 {v31.16b}, [%8] \n" "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" "mov %0, %2 \n" "st1 {v16.s}[0], [%0], %6 \n" "st1 {v16.s}[1], [%0], %6 \n" "st1 {v16.s}[2], [%0], %6 \n" "st1 {v16.s}[3], [%0], %6 \n" "add %0, %2, #4 \n" "st1 {v18.s}[0], [%0], %6 \n" "st1 {v18.s}[1], [%0], %6 \n" "st1 {v18.s}[2], [%0], %6 \n" "st1 {v18.s}[3], [%0] \n" "mov %0, %3 \n" "st1 {v17.s}[0], [%0], %7 \n" "st1 {v17.s}[1], [%0], %7 \n" "st1 {v17.s}[2], [%0], %7 \n" "st1 {v17.s}[3], [%0], %7 \n" "add %0, %3, #4 \n" "st1 {v19.s}[0], [%0], %7 \n" "st1 {v19.s}[1], [%0], %7 \n" "st1 {v19.s}[2], [%0], %7 \n" "st1 {v19.s}[3], [%0] \n" "add %1, %1, #8 \n" // src += 4 * 2 "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * // dst_stride_a "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * // dst_stride_b "subs %w4, %w4, #4 \n" // w -= 4 "b.eq 4f \n" // some residual, check to see if it includes a 2x8 block, // or less "cmp %w4, #2 \n" "b.lt 3f \n" // 2x8 block "2: \n" "mov %0, %1 \n" "ld2 {v0.h, v1.h}[0], [%0], %5 \n" "ld2 {v2.h, v3.h}[0], [%0], %5 \n" "ld2 {v0.h, v1.h}[1], [%0], %5 \n" "ld2 {v2.h, v3.h}[1], [%0], %5 \n" "ld2 {v0.h, v1.h}[2], [%0], %5 \n" "ld2 {v2.h, v3.h}[2], [%0], %5 \n" "ld2 {v0.h, v1.h}[3], [%0], %5 \n" "ld2 {v2.h, v3.h}[3], [%0] \n" "trn1 v4.8b, v0.8b, v2.8b \n" "trn2 v5.8b, v0.8b, v2.8b \n" "trn1 v6.8b, v1.8b, v3.8b \n" "trn2 v7.8b, v1.8b, 
v3.8b \n" "mov %0, %2 \n" "st1 {v4.d}[0], [%0], %6 \n" "st1 {v6.d}[0], [%0] \n" "mov %0, %3 \n" "st1 {v5.d}[0], [%0], %7 \n" "st1 {v7.d}[0], [%0] \n" "add %1, %1, #4 \n" // src += 2 * 2 "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * // dst_stride_a "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * // dst_stride_b "subs %w4, %w4, #2 \n" // w -= 2 "b.eq 4f \n" // 1x8 block "3: \n" "ld2 {v0.b, v1.b}[0], [%1], %5 \n" "ld2 {v0.b, v1.b}[1], [%1], %5 \n" "ld2 {v0.b, v1.b}[2], [%1], %5 \n" "ld2 {v0.b, v1.b}[3], [%1], %5 \n" "ld2 {v0.b, v1.b}[4], [%1], %5 \n" "ld2 {v0.b, v1.b}[5], [%1], %5 \n" "ld2 {v0.b, v1.b}[6], [%1], %5 \n" "ld2 {v0.b, v1.b}[7], [%1] \n" "st1 {v0.d}[0], [%2] \n" "st1 {v1.d}[0], [%3] \n" "4: \n" : "=&r"(src_temp), // %0 "+r"(src), // %1 "+r"(dst_a), // %2 "+r"(dst_b), // %3 "+r"(width) // %4 : "r"(static_cast(src_stride)), // %5 "r"(static_cast(dst_stride_a)), // %6 "r"(static_cast(dst_stride_b)), // %7 "r"(&kVTbl4x4TransposeDi) // %8 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/rotate_win.cc000066400000000000000000000170731416500237200215200ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/rotate_row.h" #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && defined(_M_IX86) __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, int src_stride, uint8_t* dst, int dst_stride, int width) { __asm { push edi push esi push ebp mov eax, [esp + 12 + 4] // src mov edi, [esp + 12 + 8] // src_stride mov edx, [esp + 12 + 12] // dst mov esi, [esp + 12 + 16] // dst_stride mov ecx, [esp + 12 + 20] // width // Read in the data from the source pointer. // First round of bit swap. align 4 convertloop: movq xmm0, qword ptr [eax] lea ebp, [eax + 8] movq xmm1, qword ptr [eax + edi] lea eax, [eax + 2 * edi] punpcklbw xmm0, xmm1 movq xmm2, qword ptr [eax] movdqa xmm1, xmm0 palignr xmm1, xmm1, 8 movq xmm3, qword ptr [eax + edi] lea eax, [eax + 2 * edi] punpcklbw xmm2, xmm3 movdqa xmm3, xmm2 movq xmm4, qword ptr [eax] palignr xmm3, xmm3, 8 movq xmm5, qword ptr [eax + edi] punpcklbw xmm4, xmm5 lea eax, [eax + 2 * edi] movdqa xmm5, xmm4 movq xmm6, qword ptr [eax] palignr xmm5, xmm5, 8 movq xmm7, qword ptr [eax + edi] punpcklbw xmm6, xmm7 mov eax, ebp movdqa xmm7, xmm6 palignr xmm7, xmm7, 8 // Second round of bit swap. punpcklwd xmm0, xmm2 punpcklwd xmm1, xmm3 movdqa xmm2, xmm0 movdqa xmm3, xmm1 palignr xmm2, xmm2, 8 palignr xmm3, xmm3, 8 punpcklwd xmm4, xmm6 punpcklwd xmm5, xmm7 movdqa xmm6, xmm4 movdqa xmm7, xmm5 palignr xmm6, xmm6, 8 palignr xmm7, xmm7, 8 // Third round of bit swap. // Write to the destination pointer. 
punpckldq xmm0, xmm4 movq qword ptr [edx], xmm0 movdqa xmm4, xmm0 palignr xmm4, xmm4, 8 movq qword ptr [edx + esi], xmm4 lea edx, [edx + 2 * esi] punpckldq xmm2, xmm6 movdqa xmm6, xmm2 palignr xmm6, xmm6, 8 movq qword ptr [edx], xmm2 punpckldq xmm1, xmm5 movq qword ptr [edx + esi], xmm6 lea edx, [edx + 2 * esi] movdqa xmm5, xmm1 movq qword ptr [edx], xmm1 palignr xmm5, xmm5, 8 punpckldq xmm3, xmm7 movq qword ptr [edx + esi], xmm5 lea edx, [edx + 2 * esi] movq qword ptr [edx], xmm3 movdqa xmm7, xmm3 palignr xmm7, xmm7, 8 sub ecx, 8 movq qword ptr [edx + esi], xmm7 lea edx, [edx + 2 * esi] jg convertloop pop ebp pop esi pop edi ret } } __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, int src_stride, uint8_t* dst_a, int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int w) { __asm { push ebx push esi push edi push ebp mov eax, [esp + 16 + 4] // src mov edi, [esp + 16 + 8] // src_stride mov edx, [esp + 16 + 12] // dst_a mov esi, [esp + 16 + 16] // dst_stride_a mov ebx, [esp + 16 + 20] // dst_b mov ebp, [esp + 16 + 24] // dst_stride_b mov ecx, esp sub esp, 4 + 16 and esp, ~15 mov [esp + 16], ecx mov ecx, [ecx + 16 + 28] // w align 4 // Read in the data from the source pointer. // First round of bit swap. convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm0 // use xmm7 as temp register. punpcklbw xmm0, xmm1 punpckhbw xmm7, xmm1 movdqa xmm1, xmm7 movdqu xmm2, [eax] movdqu xmm3, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm2 punpcklbw xmm2, xmm3 punpckhbw xmm7, xmm3 movdqa xmm3, xmm7 movdqu xmm4, [eax] movdqu xmm5, [eax + edi] lea eax, [eax + 2 * edi] movdqa xmm7, xmm4 punpcklbw xmm4, xmm5 punpckhbw xmm7, xmm5 movdqa xmm5, xmm7 movdqu xmm6, [eax] movdqu xmm7, [eax + edi] lea eax, [eax + 2 * edi] movdqu [esp], xmm5 // backup xmm5 neg edi movdqa xmm5, xmm6 // use xmm5 as temp register. 
punpcklbw xmm6, xmm7 punpckhbw xmm5, xmm7 movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi // Second round of bit swap. movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 movdqa xmm2, xmm5 movdqa xmm5, xmm1 punpcklwd xmm1, xmm3 punpckhwd xmm5, xmm3 movdqa xmm3, xmm5 movdqa xmm5, xmm4 punpcklwd xmm4, xmm6 punpckhwd xmm5, xmm6 movdqa xmm6, xmm5 movdqu xmm5, [esp] // restore xmm5 movdqu [esp], xmm6 // backup xmm6 movdqa xmm6, xmm5 // use xmm6 as temp register. punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 // Third round of bit swap. // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 movdqa xmm4, xmm6 movdqu xmm6, [esp] // restore xmm6 movlpd qword ptr [edx], xmm0 movhpd qword ptr [ebx], xmm0 movlpd qword ptr [edx + esi], xmm4 lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm4 lea ebx, [ebx + 2 * ebp] movdqa xmm0, xmm2 // use xmm0 as the temp register. punpckldq xmm2, xmm6 movlpd qword ptr [edx], xmm2 movhpd qword ptr [ebx], xmm2 punpckhdq xmm0, xmm6 movlpd qword ptr [edx + esi], xmm0 lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] movdqa xmm0, xmm1 // use xmm0 as the temp register. punpckldq xmm1, xmm5 movlpd qword ptr [edx], xmm1 movhpd qword ptr [ebx], xmm1 punpckhdq xmm0, xmm5 movlpd qword ptr [edx + esi], xmm0 lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] movdqa xmm0, xmm3 // use xmm0 as the temp register. 
punpckldq xmm3, xmm7 movlpd qword ptr [edx], xmm3 movhpd qword ptr [ebx], xmm3 punpckhdq xmm0, xmm7 sub ecx, 8 movlpd qword ptr [edx + esi], xmm0 lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] jg convertloop mov esp, [esp + 16] pop ebp pop edi pop esi pop ebx ret } } #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/row_any.cc000066400000000000000000002435131416500237200210230ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" #include // For memset. #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // memset for temp is meant to clear the source buffer (not dest) so that // SIMD that reads full multiple of 16 bytes will not trigger msan errors. // memset is not needed for production, as the garbage values are processed but // not used, although there may be edge cases for subsampling. // The size of the buffer is based on the largest read, which can be inferred // by the source type (e.g. ARGB) and the mask (last parameter), or by examining // the source code for how much the source pointers are advanced. // Subsampled source needs to be increase by 1 of not even. 
// Rounded-up right shift: the number of subsampled elements that cover
// `width` full-resolution elements.
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))

// Any 4 planes to 1.
//
// Generates NAMEANY, a wrapper that lets the fixed-block kernel ANY_SIMD
// handle any width. The largest multiple of MASK + 1 pixels is processed
// directly from the caller's buffers. The remaining pixels are staged in a
// zero-filled scratch buffer with one 64-byte slot per input plane, processed
// there as one full block of MASK + 1 pixels, and only the valid part of the
// result is copied to dst_ptr.
#define ANY41(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)               \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,                   \
               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
               int width) {                                                  \
    SIMD_ALIGNED(uint8_t temp[64 * 5]);                                      \
    uint8_t* const tmp_y = temp;                                             \
    uint8_t* const tmp_u = temp + 64;                                        \
    uint8_t* const tmp_v = temp + 128;                                       \
    uint8_t* const tmp_a = temp + 192;                                       \
    uint8_t* const tmp_dst = temp + 256;                                     \
    const int tail = width & MASK;                                           \
    const int bulk = width & ~MASK;                                          \
    const int tail_uv = SS(tail, UVSHIFT);                                   \
    memset(temp, 0, 64 * 4); /* clear the four input slots for msan */       \
    if (bulk > 0) {                                                          \
      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, bulk);                   \
    }                                                                        \
    memcpy(tmp_y, y_buf + bulk, tail);                                       \
    memcpy(tmp_u, u_buf + (bulk >> UVSHIFT), tail_uv);                       \
    memcpy(tmp_v, v_buf + (bulk >> UVSHIFT), tail_uv);                       \
    memcpy(tmp_a, a_buf + bulk, tail);                                       \
    ANY_SIMD(tmp_y, tmp_u, tmp_v, tmp_a, tmp_dst, MASK + 1);                 \
    memcpy(dst_ptr + (bulk >> DUVSHIFT) * BPP, tmp_dst,                      \
           SS(tail, DUVSHIFT) * BPP);                                        \
  }

#ifdef HAS_MERGEARGBROW_SSE2
ANY41(MergeARGBRow_Any_SSE2, MergeARGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEARGBROW_AVX2
ANY41(MergeARGBRow_Any_AVX2, MergeARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEARGBROW_NEON
ANY41(MergeARGBRow_Any_NEON, MergeARGBRow_NEON, 0, 0, 4, 15)
#endif

// Note that odd width replication includes 444 due to implementation
// on arm that subsamples 444 to 422 internally.
// Any 4 planes to 1 with yuvconstants
// NAMEANY runs ANY_SIMD in place on the leading (width & ~MASK) pixels, then
// stages the remaining (width & MASK) pixels in a zeroed scratch buffer,
// converts one full block of MASK + 1 pixels there and copies back only the
// valid output bytes.
#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
               const struct YuvConstants* yuvconstants, int width) { \
    const int bulk = width & ~MASK; /* pixels converted in place */ \
    const int rem = width & MASK;   /* pixels staged through temp */ \
    const int uv_rem = SS(rem, UVSHIFT); \
    /* Scratch layout: y at 0, u at 64, v at 128, a at 192, output at 256. */ \
    SIMD_ALIGNED(uint8_t temp[64 * 5]); \
    memset(temp, 0, 64 * 4); /* for msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, bulk); \
    } \
    memcpy(temp, y_buf + bulk, rem); \
    memcpy(temp + 64, u_buf + (bulk >> UVSHIFT), uv_rem); \
    memcpy(temp + 128, v_buf + (bulk >> UVSHIFT), uv_rem); \
    memcpy(temp + 192, a_buf + bulk, rem); \
    if (width & 1) { \
      /* Odd width: repeat the last staged u and v sample once more. */ \
      temp[64 + uv_rem] = temp[64 + uv_rem - 1]; \
      temp[128 + uv_rem] = temp[128 + uv_rem - 1]; \
    } \
    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \
             yuvconstants, MASK + 1); \
    memcpy(dst_ptr + (bulk >> DUVSHIFT) * BPP, temp + 256, \
           SS(rem, DUVSHIFT) * BPP); \
  }

#ifdef HAS_I444ALPHATOARGBROW_SSSE3
ANY41C(I444AlphaToARGBRow_Any_SSSE3, I444AlphaToARGBRow_SSSE3, 0, 0, 4, 7)
#endif
#ifdef HAS_I444ALPHATOARGBROW_AVX2
ANY41C(I444AlphaToARGBRow_Any_AVX2, I444AlphaToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_AVX2
ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444ALPHATOARGBROW_NEON
ANY41C(I444AlphaToARGBRow_Any_NEON, I444AlphaToARGBRow_NEON, 0, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_NEON
ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
#endif
#ifdef HAS_I444ALPHATOARGBROW_MSA
ANY41C(I444AlphaToARGBRow_Any_MSA, I444AlphaToARGBRow_MSA, 0, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_MSA
ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
#endif
#ifdef HAS_I444ALPHATOARGBROW_MMI
ANY41C(I444AlphaToARGBRow_Any_MMI, I444AlphaToARGBRow_MMI, 0, 0, 4, 7)
#endif
#ifdef HAS_I422ALPHATOARGBROW_MMI
ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7)
#endif
#undef ANY41C

// Any 4 planes to 1 plane of 8 bit with yuvconstants
// Same scheme as above for source planes of element type T (SBPP bytes per
// element). The tail is staged in 16-element slots of temp and converted
// into the separate byte buffer out.
#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \
               uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
               int width) { \
    const int bulk = width & ~MASK; \
    const int rem = width & MASK; \
    const int uv_rem = SS(rem, UVSHIFT); \
    SIMD_ALIGNED(T temp[16 * 4]); \
    SIMD_ALIGNED(uint8_t out[64]); \
    memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, bulk); \
    } \
    memcpy(temp, y_buf + bulk, rem * SBPP); \
    memcpy(temp + 16, u_buf + (bulk >> UVSHIFT), uv_rem * SBPP); \
    memcpy(temp + 32, v_buf + (bulk >> UVSHIFT), uv_rem * SBPP); \
    memcpy(temp + 48, a_buf + bulk, rem * SBPP); \
    ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \
             MASK + 1); \
    memcpy(dst_ptr + (bulk >> DUVSHIFT) * BPP, out, \
           SS(rem, DUVSHIFT) * BPP); \
  }

#ifdef HAS_I210ALPHATOARGBROW_SSSE3
ANY41CT(I210AlphaToARGBRow_Any_SSSE3,
        I210AlphaToARGBRow_SSSE3,
        1,
        0,
        uint16_t,
        2,
        4,
        7)
#endif
#ifdef HAS_I210ALPHATOARGBROW_AVX2
ANY41CT(I210AlphaToARGBRow_Any_AVX2,
        I210AlphaToARGBRow_AVX2,
        1,
        0,
        uint16_t,
        2,
        4,
        15)
#endif
#ifdef HAS_I410ALPHATOARGBROW_SSSE3
ANY41CT(I410AlphaToARGBRow_Any_SSSE3,
        I410AlphaToARGBRow_SSSE3,
        0,
        0,
        uint16_t,
        2,
        4,
        7)
#endif
#ifdef HAS_I410ALPHATOARGBROW_AVX2
ANY41CT(I410AlphaToARGBRow_Any_AVX2,
        I410AlphaToARGBRow_AVX2,
        0,
        0,
        uint16_t,
        2,
        4,
        15)
#endif
#undef ANY41CT

// Any 4 planes to 1 plane with parameter
// Four STYPE planes (SBPP bytes per element) are combined into DTYPE output
// of BPP bytes per pixel; the extra int parameter depth is forwarded to
// ANY_SIMD unchanged.
#define ANY41PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
  void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
               const STYPE* a_buf, DTYPE* dst_ptr, int depth, int width) { \
    const int bulk = width & ~MASK; \
    const int rem = width & MASK; \
    SIMD_ALIGNED(STYPE temp[16 * 4]); \
    SIMD_ALIGNED(DTYPE out[64]); \
    memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \
    if (bulk > 0) { \
      ANY_SIMD(r_buf, g_buf, b_buf, a_buf, dst_ptr, depth, bulk); \
    } \
    memcpy(temp, r_buf + bulk, rem * SBPP); \
    memcpy(temp + 16, g_buf + bulk, rem * SBPP); \
    memcpy(temp + 32, b_buf + bulk, rem * SBPP); \
    memcpy(temp + 48, a_buf + bulk, rem * SBPP); \
    ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, depth, MASK + 1); \
    /* Destination offset is computed in bytes, hence the uint8_t cast. */ \
    memcpy((uint8_t*)dst_ptr + bulk * BPP, out, rem * BPP); \
  }

#ifdef HAS_MERGEAR64ROW_AVX2
ANY41PT(MergeAR64Row_Any_AVX2, MergeAR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
#endif
#ifdef HAS_MERGEAR64ROW_NEON
ANY41PT(MergeAR64Row_Any_NEON, MergeAR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
#endif
#ifdef HAS_MERGEARGB16TO8ROW_AVX2
ANY41PT(MergeARGB16To8Row_Any_AVX2,
        MergeARGB16To8Row_AVX2,
        uint16_t,
        2,
        uint8_t,
        4,
        15)
#endif
#ifdef HAS_MERGEARGB16TO8ROW_NEON
ANY41PT(MergeARGB16To8Row_Any_NEON,
        MergeARGB16To8Row_NEON,
        uint16_t,
        2,
        uint8_t,
        4,
        7)
#endif
#undef ANY41PT

// Any 3 planes to 1.
// NAMEANY runs ANY_SIMD in place on the leading (width & ~MASK) pixels and
// converts the remaining (width & MASK) pixels through a zeroed scratch
// buffer, one full block of MASK + 1 pixels at a time.
#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
               const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
    const int bulk = width & ~MASK; \
    const int rem = width & MASK; \
    const int uv_rem = SS(rem, UVSHIFT); \
    /* Scratch layout: y at 0, u at 64, v at 128, output at 192. */ \
    SIMD_ALIGNED(uint8_t temp[64 * 4]); \
    memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, bulk); \
    } \
    memcpy(temp, y_buf + bulk, rem); \
    memcpy(temp + 64, u_buf + (bulk >> UVSHIFT), uv_rem); \
    memcpy(temp + 128, v_buf + (bulk >> UVSHIFT), uv_rem); \
    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
    memcpy(dst_ptr + (bulk >> DUVSHIFT) * BPP, temp + 192, \
           SS(rem, DUVSHIFT) * BPP); \
  }

// Merge functions.
// ANY31 arguments: wrapper name, SIMD row function, UVSHIFT, DUVSHIFT, BPP,
// MASK. Each wrapper is emitted only when its HAS_ macro is defined.
#ifdef HAS_MERGERGBROW_SSSE3
ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
#endif
#ifdef HAS_MERGERGBROW_NEON
ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
#endif
#ifdef HAS_MERGERGBROW_MMI
ANY31(MergeRGBRow_Any_MMI, MergeRGBRow_MMI, 0, 0, 3, 7)
#endif

#ifdef HAS_MERGEXRGBROW_SSE2
ANY31(MergeXRGBRow_Any_SSE2, MergeXRGBRow_SSE2, 0, 0, 4, 7)
#endif
#ifdef HAS_MERGEXRGBROW_AVX2
ANY31(MergeXRGBRow_Any_AVX2, MergeXRGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_MERGEXRGBROW_NEON
ANY31(MergeXRGBRow_Any_NEON, MergeXRGBRow_NEON, 0, 0, 4, 15)
#endif

#ifdef HAS_I422TOYUY2ROW_SSE2
ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_AVX2
ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
#endif
#ifdef HAS_I422TOYUY2ROW_NEON
ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOYUY2ROW_MSA
ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
#endif
#ifdef HAS_I422TOYUY2ROW_MMI
ANY31(I422ToYUY2Row_Any_MMI, I422ToYUY2Row_MMI, 1, 1, 4, 7)
#endif
#ifdef HAS_I422TOUYVYROW_NEON
ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
#endif
#ifdef HAS_I422TOUYVYROW_MSA
ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
#endif
#ifdef HAS_I422TOUYVYROW_MMI
ANY31(I422ToUYVYRow_Any_MMI, I422ToUYVYRow_MMI, 1, 1, 4, 7)
#endif

#ifdef HAS_BLENDPLANEROW_AVX2
ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
#endif
#ifdef HAS_BLENDPLANEROW_SSSE3
ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
#endif
#ifdef HAS_BLENDPLANEROW_MMI
ANY31(BlendPlaneRow_Any_MMI, BlendPlaneRow_MMI, 0, 0, 1, 7)
#endif
#undef ANY31

// Note that odd width replication includes 444 due to implementation
// on arm that subsamples 444 to 422 internally.
// Any 3 planes to 1 with yuvconstants
// NAMEANY runs ANY_SIMD in place on the leading (width & ~MASK) pixels, then
// stages the remaining (width & MASK) pixels in a zeroed scratch buffer,
// converts one full block of MASK + 1 pixels there and copies back only the
// valid output bytes.
#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
               const uint8_t* v_buf, uint8_t* dst_ptr, \
               const struct YuvConstants* yuvconstants, int width) { \
    const int bulk = width & ~MASK; /* pixels converted in place */ \
    const int rem = width & MASK;   /* pixels staged through temp */ \
    const int uv_rem = SS(rem, UVSHIFT); \
    /* Scratch layout: y at 0, u at 128, v at 256, output at 384. */ \
    SIMD_ALIGNED(uint8_t temp[128 * 4]); \
    memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, bulk); \
    } \
    memcpy(temp, y_buf + bulk, rem); \
    memcpy(temp + 128, u_buf + (bulk >> UVSHIFT), uv_rem); \
    memcpy(temp + 256, v_buf + (bulk >> UVSHIFT), uv_rem); \
    if (width & 1) { \
      /* Odd width: repeat the last staged u and v sample once more. */ \
      temp[128 + uv_rem] = temp[128 + uv_rem - 1]; \
      temp[256 + uv_rem] = temp[256 + uv_rem - 1]; \
    } \
    ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
             MASK + 1); \
    memcpy(dst_ptr + (bulk >> DUVSHIFT) * BPP, temp + 384, \
           SS(rem, DUVSHIFT) * BPP); \
  }

#ifdef HAS_I422TOARGBROW_SSSE3
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TORGBAROW_SSSE3
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TOARGB4444ROW_SSSE3
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGB1555ROW_SSSE3
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB565ROW_SSSE3
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TORGB24ROW_SSSE3
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
#endif
#ifdef HAS_I422TOAR30ROW_SSSE3
ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
#endif
#ifdef HAS_I422TOAR30ROW_AVX2
ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_SSSE3
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
#endif
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I422TORGBAROW_AVX2
ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15)
#endif
#ifdef HAS_I444TOARGBROW_AVX2
ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
#endif
#ifdef HAS_I422TOARGB4444ROW_AVX2
ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
#endif
#ifdef HAS_I422TOARGB1555ROW_AVX2
ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
#endif
#ifdef HAS_I422TORGB565ROW_AVX2
ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
#endif
#ifdef HAS_I422TOARGBROW_NEON
ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_MSA
ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#endif
#ifdef HAS_I422TOARGBROW_MMI
ANY31C(I444ToARGBRow_Any_MMI, I444ToARGBRow_MMI, 0, 0, 4, 7)
ANY31C(I422ToARGBRow_Any_MMI, I422ToARGBRow_MMI, 1, 0, 4, 7)
ANY31C(I422ToRGB24Row_Any_MMI, I422ToRGB24Row_MMI, 1, 0, 3, 15)
ANY31C(I422ToARGB4444Row_Any_MMI, I422ToARGB4444Row_MMI, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_MMI, I422ToARGB1555Row_MMI, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_MMI, I422ToRGB565Row_MMI, 1, 0, 2, 7)
ANY31C(I422ToRGBARow_Any_MMI, I422ToRGBARow_MMI, 1, 0, 4, 7)
#endif
#undef ANY31C

// Any 3 planes of 16 bit to 1 with yuvconstants
// TODO(fbarchard): consider sharing this code with ANY31C
// Source planes have element type T (SBPP bytes per element). The tail is
// staged in 16-element slots of temp and converted into the byte buffer out.
#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \
               uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
               int width) { \
    const int bulk = width & ~MASK; \
    const int rem = width & MASK; \
    const int uv_rem = SS(rem, UVSHIFT); \
    SIMD_ALIGNED(T temp[16 * 3]); \
    SIMD_ALIGNED(uint8_t out[64]); \
    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, bulk); \
    } \
    memcpy(temp, y_buf + bulk, rem * SBPP); \
    memcpy(temp + 16, u_buf + (bulk >> UVSHIFT), uv_rem * SBPP); \
    memcpy(temp + 32, v_buf + (bulk >> UVSHIFT), uv_rem * SBPP); \
    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
    memcpy(dst_ptr + (bulk >> DUVSHIFT) * BPP, out, \
           SS(rem, DUVSHIFT) * BPP); \
  }

#ifdef HAS_I210TOAR30ROW_SSSE3
ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I210TOARGBROW_SSSE3
ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I210TOARGBROW_AVX2
ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#ifdef HAS_I210TOAR30ROW_AVX2
ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#ifdef HAS_I410TOAR30ROW_SSSE3
ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I410TOARGBROW_SSSE3
ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I410TOARGBROW_AVX2
ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15)
#endif
#ifdef HAS_I410TOAR30ROW_AVX2
ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
#endif
#ifdef HAS_I210TOARGBROW_MMI
ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I212TOAR30ROW_SSSE3
ANY31CT(I212ToAR30Row_Any_SSSE3, I212ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I212TOARGBROW_SSSE3
ANY31CT(I212ToARGBRow_Any_SSSE3, I212ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
#endif
#ifdef HAS_I212TOARGBROW_AVX2
ANY31CT(I212ToARGBRow_Any_AVX2, I212ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#ifdef HAS_I212TOAR30ROW_AVX2
ANY31CT(I212ToAR30Row_Any_AVX2, I212ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
#endif
#undef ANY31CT

// Any 3 planes to 1 plane with parameter
// Three STYPE planes (SBPP bytes per element) are combined into DTYPE output
// of BPP bytes per pixel; the extra int parameter depth is forwarded to
// ANY_SIMD unchanged.
#define ANY31PT(NAMEANY, ANY_SIMD, STYPE, SBPP, DTYPE, BPP, MASK) \
  void NAMEANY(const STYPE* r_buf, const STYPE* g_buf, const STYPE* b_buf, \
               DTYPE* dst_ptr, int depth, int width) { \
    const int bulk = width & ~MASK; \
    const int rem = width & MASK; \
    SIMD_ALIGNED(STYPE temp[16 * 3]); \
    SIMD_ALIGNED(DTYPE out[64]); \
    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
    if (bulk > 0) { \
      ANY_SIMD(r_buf, g_buf, b_buf, dst_ptr, depth, bulk); \
    } \
    memcpy(temp, r_buf + bulk, rem * SBPP); \
    memcpy(temp + 16, g_buf + bulk, rem * SBPP); \
    memcpy(temp + 32, b_buf + bulk, rem * SBPP); \
    ANY_SIMD(temp, temp + 16, temp + 32, out, depth, MASK + 1); \
    /* Destination offset is computed in bytes, hence the uint8_t cast. */ \
    memcpy((uint8_t*)dst_ptr + bulk * BPP, out, rem * BPP); \
  }

#ifdef HAS_MERGEXR30ROW_AVX2
ANY31PT(MergeXR30Row_Any_AVX2, MergeXR30Row_AVX2, uint16_t, 2, uint8_t, 4, 15)
#endif
#ifdef HAS_MERGEXR30ROW_NEON
ANY31PT(MergeXR30Row_Any_NEON, MergeXR30Row_NEON, uint16_t, 2, uint8_t, 4, 3)
ANY31PT(MergeXR30Row_10_Any_NEON,
        MergeXR30Row_10_NEON,
        uint16_t,
        2,
        uint8_t,
        4,
        3)
#endif
#ifdef HAS_MERGEXR64ROW_AVX2
ANY31PT(MergeXR64Row_Any_AVX2, MergeXR64Row_AVX2, uint16_t, 2, uint16_t, 8, 15)
#endif
#ifdef HAS_MERGEXR64ROW_NEON
ANY31PT(MergeXR64Row_Any_NEON, MergeXR64Row_NEON, uint16_t, 2, uint16_t, 8, 7)
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_AVX2
ANY31PT(MergeXRGB16To8Row_Any_AVX2,
        MergeXRGB16To8Row_AVX2,
        uint16_t,
        2,
        uint8_t,
        4,
        15)
#endif
#ifdef HAS_MERGEXRGB16TO8ROW_NEON
ANY31PT(MergeXRGB16To8Row_Any_NEON,
        MergeXRGB16To8Row_NEON,
        uint16_t,
        2,
        uint8_t,
        4,
        7)
#endif
#undef ANY31PT

// Any 2 planes to 1.
// SBPP and SBPP2 are the bytes per pixel of the first and second source
// plane, BPP the bytes per pixel of the destination; UVSHIFT is applied to
// offsets/sizes of the second plane only.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
               int width) { \
    const int bulk = width & ~MASK; \
    const int rem = width & MASK; \
    const int uv_rem = SS(rem, UVSHIFT); \
    /* Scratch layout: first plane at 0, second at 128, output at 256. */ \
    SIMD_ALIGNED(uint8_t temp[128 * 3]); \
    memset(temp, 0, 128 * 2); /* for msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, uv_buf, dst_ptr, bulk); \
    } \
    memcpy(temp, y_buf + bulk * SBPP, rem * SBPP); \
    memcpy(temp + 128, uv_buf + (bulk >> UVSHIFT) * SBPP2, uv_rem * SBPP2); \
    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \
    memcpy(dst_ptr + bulk * BPP, temp + 256, rem * BPP); \
  }

// Merge functions.
#ifdef HAS_MERGEUVROW_SSE2
ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_AVX2
ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
#endif
#ifdef HAS_MERGEUVROW_NEON
ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_MSA
ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
#endif
#ifdef HAS_MERGEUVROW_MMI
ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
#endif
#ifdef HAS_NV21TOYUV24ROW_NEON
ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
#endif
#ifdef HAS_NV21TOYUV24ROW_SSSE3
ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15)
#endif
#ifdef HAS_NV21TOYUV24ROW_AVX2
ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
#endif

// Math functions.
// ANY21 arguments: wrapper name, SIMD row function, UVSHIFT, SBPP, SBPP2,
// BPP, MASK. Each wrapper is emitted only when its HAS_ macro is defined.
#ifdef HAS_ARGBMULTIPLYROW_SSE2
ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBADDROW_SSE2
ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBSUBTRACTROW_SSE2
ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBMULTIPLYROW_AVX2
ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_AVX2
ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_AVX2
ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBMULTIPLYROW_NEON
ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_NEON
ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_NEON
ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBMULTIPLYROW_MSA
ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
#endif
#ifdef HAS_ARGBMULTIPLYROW_MMI
ANY21(ARGBMultiplyRow_Any_MMI, ARGBMultiplyRow_MMI, 0, 4, 4, 4, 1)
#endif
#ifdef HAS_ARGBADDROW_MSA
ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBADDROW_MMI
ANY21(ARGBAddRow_Any_MMI, ARGBAddRow_MMI, 0, 4, 4, 4, 1)
#endif
#ifdef HAS_ARGBSUBTRACTROW_MSA
ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
#endif
#ifdef HAS_ARGBSUBTRACTROW_MMI
ANY21(ARGBSubtractRow_Any_MMI, ARGBSubtractRow_MMI, 0, 4, 4, 4, 1)
#endif

#ifdef HAS_SOBELROW_SSE2
ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELROW_NEON
ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
#endif
#ifdef HAS_SOBELROW_MSA
ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELROW_MMI
ANY21(SobelRow_Any_MMI, SobelRow_MMI, 0, 1, 1, 4, 7)
#endif
#ifdef HAS_SOBELTOPLANEROW_SSE2
ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
#endif
#ifdef HAS_SOBELTOPLANEROW_NEON
ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
#endif
#ifdef HAS_SOBELTOPLANEROW_MSA
ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
#endif
#ifdef HAS_SOBELTOPLANEROW_MMI
ANY21(SobelToPlaneRow_Any_MMI, SobelToPlaneRow_MMI, 0, 1, 1, 1, 7)
#endif
#ifdef HAS_SOBELXYROW_SSE2
ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELXYROW_NEON
ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
#endif
#ifdef HAS_SOBELXYROW_MSA
ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
#endif
#ifdef HAS_SOBELXYROW_MMI
ANY21(SobelXYRow_Any_MMI, SobelXYRow_MMI, 0, 1, 1, 4, 7)
#endif
#undef ANY21

// Any 2 planes to 1 with yuvconstants
// NAMEANY runs ANY_SIMD in place on the leading (width & ~MASK) pixels, then
// stages the remaining (width & MASK) pixels in a zeroed scratch buffer,
// converts one full block of MASK + 1 pixels there and copies back only the
// valid output bytes. SBPP and SBPP2 are the bytes per pixel of the first
// and second source plane, BPP the bytes per pixel of the destination.
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
               const struct YuvConstants* yuvconstants, int width) { \
    const int bulk = width & ~MASK; /* pixels converted in place */ \
    const int rem = width & MASK;   /* pixels staged through temp */ \
    const int uv_rem = SS(rem, UVSHIFT); \
    /* Scratch layout: first plane at 0, second at 128, output at 256. */ \
    SIMD_ALIGNED(uint8_t temp[128 * 3]); \
    memset(temp, 0, 128 * 2); /* for msan */ \
    if (bulk > 0) { \
      ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, bulk); \
    } \
    memcpy(temp, y_buf + bulk * SBPP, rem * SBPP); \
    memcpy(temp + 128, uv_buf + (bulk >> UVSHIFT) * SBPP2, uv_rem * SBPP2); \
    ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \
    memcpy(dst_ptr + bulk * BPP, temp + 256, rem * BPP); \
  }

// Biplanar to RGB.
#ifdef HAS_NV12TOARGBROW_SSSE3 ANY21C(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV12TOARGBROW_AVX2 ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif #ifdef HAS_NV12TOARGBROW_NEON ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV12TOARGBROW_MSA ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV12TOARGBROW_MMI ANY21C(NV12ToARGBRow_Any_MMI, NV12ToARGBRow_MMI, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV21TOARGBROW_AVX2 ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif #ifdef HAS_NV21TOARGBROW_NEON ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV21TOARGBROW_MSA ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV21TOARGBROW_MMI ANY21C(NV21ToARGBRow_Any_MMI, NV21ToARGBRow_MMI, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV12TORGB24ROW_NEON ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) #endif #ifdef HAS_NV21TORGB24ROW_NEON ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) #endif #ifdef HAS_NV12TORGB24ROW_SSSE3 ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) #endif #ifdef HAS_NV12TORGB24ROW_MMI ANY21C(NV12ToRGB24Row_Any_MMI, NV12ToRGB24Row_MMI, 1, 1, 2, 3, 7) #endif #ifdef HAS_NV21TORGB24ROW_SSSE3 ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) #endif #ifdef HAS_NV12TORGB24ROW_AVX2 ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) #endif #ifdef HAS_NV21TORGB24ROW_AVX2 ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) #endif #ifdef HAS_NV21TORGB24ROW_MMI ANY21C(NV21ToRGB24Row_Any_MMI, NV21ToRGB24Row_MMI, 1, 1, 2, 3, 7) #endif #ifdef HAS_NV12TORGB565ROW_SSSE3 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif #ifdef 
HAS_NV12TORGB565ROW_AVX2 ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #endif #ifdef HAS_NV12TORGB565ROW_NEON ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) #endif #ifdef HAS_NV12TORGB565ROW_MSA ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) #endif #ifdef HAS_NV12TORGB565ROW_MMI ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) #endif #undef ANY21C // Any 2 planes of 16 bit to 1 with yuvconstants #define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ SIMD_ALIGNED(T temp[16 * 3]); \ SIMD_ALIGNED(uint8_t out[64]); \ memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ } \ memcpy(temp, y_buf + n, r * SBPP); \ memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ } #ifdef HAS_P210TOAR30ROW_SSSE3 ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) #endif #ifdef HAS_P210TOARGBROW_SSSE3 ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) #endif #ifdef HAS_P210TOARGBROW_AVX2 ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif #ifdef HAS_P210TOAR30ROW_AVX2 ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif #ifdef HAS_P410TOAR30ROW_SSSE3 ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) #endif #ifdef HAS_P410TOARGBROW_SSSE3 ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) #endif #ifdef HAS_P410TOARGBROW_AVX2 ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif #ifdef 
HAS_P410TOAR30ROW_AVX2 ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) #endif #undef ANY21CT // Any 2 16 bit planes with parameter to 1 #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ int width) { \ SIMD_ALIGNED(T temp[16 * 4]); \ memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_u, src_v, dst_uv, depth, n); \ } \ memcpy(temp, src_u + n, r * BPP); \ memcpy(temp + 16, src_v + n, r * BPP); \ ANY_SIMD(temp, temp + 16, temp + 32, depth, MASK + 1); \ memcpy(dst_uv + n * 2, temp + 32, r * BPP * 2); \ } #ifdef HAS_MERGEUVROW_16_AVX2 ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15) #endif #ifdef HAS_MERGEUVROW_16_NEON ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7) #endif #undef ANY21CT // Any 1 to 1. #define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ ANY_SIMD(temp, temp + 128, MASK + 1); \ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) #endif #ifdef HAS_COPYROW_SSE2 ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) #endif #ifdef HAS_COPYROW_NEON ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) #endif #if defined(HAS_ARGBTORGB24ROW_SSSE3) ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, 
ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif #if defined(HAS_ARGBTORGB24ROW_AVX2) ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) #endif #if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) #endif #if defined(HAS_ARGBTORAWROW_AVX2) ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) #endif #if defined(HAS_ARGBTORGB565ROW_AVX2) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) #endif #if defined(HAS_ARGBTOARGB4444ROW_AVX2) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif #if defined(HAS_ABGRTOAR30ROW_SSSE3) ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) #endif #if defined(HAS_ARGBTOAR30ROW_SSSE3) ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) #endif #if defined(HAS_ABGRTOAR30ROW_AVX2) ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) #endif #if defined(HAS_ARGBTOAR30ROW_AVX2) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) #endif #if defined(HAS_J400TOARGBROW_SSE2) ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #endif #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif #if defined(HAS_RGB24TOARGBROW_SSSE3) ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) #endif #if defined(HAS_RAWTORGBAROW_SSSE3) ANY11(RAWToRGBARow_Any_SSSE3, RAWToRGBARow_SSSE3, 0, 3, 4, 15) #endif #if defined(HAS_RAWTORGB24ROW_SSSE3) ANY11(RAWToRGB24Row_Any_SSSE3, RAWToRGB24Row_SSSE3, 0, 3, 3, 7) #endif #if defined(HAS_RGB565TOARGBROW_AVX2) ANY11(RGB565ToARGBRow_Any_AVX2, 
RGB565ToARGBRow_AVX2, 0, 2, 4, 15) #endif #if defined(HAS_ARGB1555TOARGBROW_AVX2) ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) #endif #if defined(HAS_ARGB4444TOARGBROW_AVX2) ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) #endif #if defined(HAS_ARGBTORGB24ROW_MSA) ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) #endif #if defined(HAS_ARGBTORGB24ROW_MMI) ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) ANY11(ARGBToRAWRow_Any_MMI, ARGBToRAWRow_MMI, 0, 4, 3, 3) ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) #endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif #if defined(HAS_RAWTORGB24ROW_MSA) ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) #endif #if defined(HAS_RAWTORGB24ROW_MMI) ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) #endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif #ifdef HAS_ABGRTOYROW_AVX2 
ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31)
#endif
// Each ANY11(...) line below defines one "_Any_" wrapper around the SIMD
// kernel named in its second argument.  The ANY11 macro itself is defined
// earlier in this file; its arguments are presumably
// (NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK), matching ANY11B further
// down - TODO confirm against the ANY11 definition.
#ifdef HAS_ARGBTOYJROW_AVX2
ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_RGBATOYJROW_AVX2
ANY11(RGBAToYJRow_Any_AVX2, RGBAToYJRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_AVX2
ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31)
#endif
#ifdef HAS_YUY2TOYROW_AVX2
ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31)
#endif
#ifdef HAS_ARGBTOYROW_SSSE3
ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_SSSE3
ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15)
ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15)
ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOYROW_SSE2
ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15)
ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_SSSE3
ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYJROW_SSSE3
ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_NEON
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYROW_MSA
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYROW_MMI
ANY11(ARGBToYRow_Any_MMI, ARGBToYRow_MMI, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_NEON
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYJROW_NEON
ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBTOYJROW_MSA
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_ARGBTOYJROW_MMI
ANY11(ARGBToYJRow_Any_MMI, ARGBToYJRow_MMI, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_NEON
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_BGRATOYROW_MSA
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_BGRATOYROW_MMI
ANY11(BGRAToYRow_Any_MMI, BGRAToYRow_MMI, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_NEON
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_MSA
// NOTE(review): the last argument is 7 here, while the ARGB/BGRA/RGBA MSA
// ToY wrappers in this list all pass 15 - confirm against the step size of
// ABGRToYRow_MSA.
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
#endif
#ifdef HAS_ABGRTOYROW_MMI
ANY11(ABGRToYRow_Any_MMI, ABGRToYRow_MMI, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_NEON
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
#endif
#ifdef HAS_RGBATOYROW_MSA
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
#endif
#ifdef HAS_RGBATOYROW_MMI
ANY11(RGBAToYRow_Any_MMI, RGBAToYRow_MMI, 0, 4, 1, 7)
#endif
#ifdef HAS_RGB24TOYROW_NEON
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RGB24TOYJROW_AVX2
ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
#endif
#ifdef HAS_RGB24TOYJROW_SSSE3
ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYJROW_NEON
ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RGB24TOYROW_MSA
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
#endif
#ifdef HAS_RGB24TOYROW_MMI
ANY11(RGB24ToYRow_Any_MMI, RGB24ToYRow_MMI, 0, 3, 1, 7)
#endif
#ifdef HAS_RAWTOYROW_NEON
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RAWTOYJROW_AVX2
ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
#endif
#ifdef HAS_RAWTOYJROW_SSSE3
ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYJROW_NEON
ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
#endif
#ifdef HAS_RAWTOYROW_MSA
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
#endif
#ifdef HAS_RAWTOYROW_MMI
ANY11(RAWToYRow_Any_MMI, RAWToYRow_MMI, 0, 3, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_NEON
ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_RGB565TOYROW_MSA
ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
#endif
#ifdef HAS_RGB565TOYROW_MMI
ANY11(RGB565ToYRow_Any_MMI, RGB565ToYRow_MMI, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_NEON
ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB1555TOYROW_MSA
ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
#endif
#ifdef HAS_ARGB1555TOYROW_MMI
ANY11(ARGB1555ToYRow_Any_MMI, ARGB1555ToYRow_MMI, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB4444TOYROW_NEON
ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
#endif
#ifdef HAS_ARGB4444TOYROW_MMI
ANY11(ARGB4444ToYRow_Any_MMI, ARGB4444ToYRow_MMI, 0, 2, 1, 7)
#endif
#ifdef HAS_YUY2TOYROW_NEON
ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_UYVYTOYROW_NEON
ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOYROW_MSA
ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
#endif
#ifdef HAS_YUY2TOYROW_MMI
ANY11(YUY2ToYRow_Any_MMI, YUY2ToYRow_MMI, 1, 4, 1, 7)
#endif
#ifdef HAS_UYVYTOYROW_MSA
ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#endif
#ifdef HAS_UYVYTOYROW_MMI
ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
#endif
#ifdef HAS_AYUVTOYROW_NEON
ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_SWAPUVROW_SSSE3
ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15)
#endif
#ifdef HAS_SWAPUVROW_AVX2
ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31)
#endif
#ifdef HAS_SWAPUVROW_NEON
ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RGB24TOARGBROW_MSA
ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_MMI
ANY11(RGB24ToARGBRow_Any_MMI, RGB24ToARGBRow_MMI, 0, 3, 4, 3)
#endif
#ifdef HAS_RAWTOARGBROW_NEON
ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTORGBAROW_NEON
ANY11(RAWToRGBARow_Any_NEON, RAWToRGBARow_NEON, 0, 3, 4, 7)
#endif
#ifdef HAS_RAWTOARGBROW_MSA
ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
#endif
#ifdef HAS_RAWTOARGBROW_MMI
ANY11(RAWToARGBRow_Any_MMI, RAWToARGBRow_MMI, 0, 3, 4, 3)
#endif
#ifdef HAS_RGB565TOARGBROW_NEON
ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_RGB565TOARGBROW_MSA
ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
#endif
#ifdef HAS_RGB565TOARGBROW_MMI
ANY11(RGB565ToARGBRow_Any_MMI, RGB565ToARGBRow_MMI, 0, 2, 4, 3)
#endif
#ifdef HAS_ARGB1555TOARGBROW_NEON
ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB1555TOARGBROW_MSA
ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
#endif
#ifdef HAS_ARGB1555TOARGBROW_MMI
ANY11(ARGB1555ToARGBRow_Any_MMI, ARGB1555ToARGBRow_MMI, 0, 2, 4, 3)
#endif
#ifdef HAS_ARGB4444TOARGBROW_NEON
ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
#endif
#ifdef HAS_ARGB4444TOARGBROW_MSA
ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
#endif
#ifdef HAS_ARGB4444TOARGBROW_MMI
ANY11(ARGB4444ToARGBRow_Any_MMI, ARGB4444ToARGBRow_MMI, 0, 2, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_SSSE3
ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_SSE2
ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3)
#endif
#ifdef HAS_ARGBATTENUATEROW_AVX2
ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_NEON
ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_MSA
ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
#endif
#ifdef HAS_ARGBATTENUATEROW_MMI
ANY11(ARGBAttenuateRow_Any_MMI, ARGBAttenuateRow_MMI, 0, 4, 4, 1)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
#endif
#ifdef HAS_ARGBEXTRACTALPHAROW_NEON
ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif #ifdef HAS_ARGBEXTRACTALPHAROW_MSA ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) #endif #ifdef HAS_ARGBEXTRACTALPHAROW_MMI ANY11(ARGBExtractAlphaRow_Any_MMI, ARGBExtractAlphaRow_MMI, 0, 4, 1, 7) #endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. #define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ ANY_SIMD(temp, temp + 64, MASK + 1); \ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) #endif #ifdef HAS_ARGBCOPYALPHAROW_SSE2 ANY11B(ARGBCopyAlphaRow_Any_SSE2, ARGBCopyAlphaRow_SSE2, 0, 4, 4, 7) #endif #ifdef HAS_ARGBCOPYALPHAROW_MMI ANY11B(ARGBCopyAlphaRow_Any_MMI, ARGBCopyAlphaRow_MMI, 0, 4, 4, 1) #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 ANY11B(ARGBCopyYToAlphaRow_Any_AVX2, ARGBCopyYToAlphaRow_AVX2, 0, 1, 4, 15) #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) #endif #ifdef HAS_ARGBCOPYYTOALPHAROW_MMI ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) #endif #undef ANY11B // Any 1 to 1 with parameter. 
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ ANY_SIMD(temp, temp + 64, param, MASK + 1); \ memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } #if defined(HAS_I400TOARGBROW_SSE2) ANY11P(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, const struct YuvConstants*, 1, 4, 7) #endif #if defined(HAS_I400TOARGBROW_AVX2) ANY11P(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, const struct YuvConstants*, 1, 4, 15) #endif #if defined(HAS_I400TOARGBROW_NEON) ANY11P(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, const struct YuvConstants*, 1, 4, 7) #endif #if defined(HAS_I400TOARGBROW_MSA) ANY11P(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, const struct YuvConstants*, 1, 4, 15) #endif #if defined(HAS_I400TOARGBROW_MMI) ANY11P(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, const struct YuvConstants*, 1, 4, 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, const uint32_t, 4, 2, 3) #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, const uint32_t, 4, 2, 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, const uint32_t, 4, 2, 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_MSA) ANY11P(ARGBToRGB565DitherRow_Any_MSA, ARGBToRGB565DitherRow_MSA, const uint32_t, 4, 2, 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_MMI) ANY11P(ARGBToRGB565DitherRow_Any_MMI, ARGBToRGB565DitherRow_MMI, const uint32_t, 4, 2, 3) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 
ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif #ifdef HAS_ARGBSHUFFLEROW_NEON ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) #endif #ifdef HAS_ARGBSHUFFLEROW_MSA ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_MMI ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) #endif #undef ANY11P #undef ANY11P // Any 1 to 1 with type #define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ } #ifdef HAS_ARGBTOAR64ROW_SSSE3 ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) #endif #ifdef HAS_ARGBTOAB64ROW_SSSE3 ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) #endif #ifdef HAS_AR64TOARGBROW_SSSE3 ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) #endif #ifdef HAS_ARGBTOAR64ROW_SSSE3 ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) #endif #ifdef HAS_ARGBTOAR64ROW_AVX2 ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) #endif #ifdef HAS_ARGBTOAB64ROW_AVX2 ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) #endif #ifdef HAS_AR64TOARGBROW_AVX2 ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) #endif #ifdef HAS_ARGBTOAR64ROW_AVX2 ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) #endif #ifdef HAS_ARGBTOAR64ROW_NEON 
ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) #endif #ifdef HAS_ARGBTOAB64ROW_NEON ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) #endif #ifdef HAS_AR64TOARGBROW_NEON ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) #endif #ifdef HAS_ARGBTOAR64ROW_NEON ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) #endif #undef ANY11T // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ SIMD_ALIGNED(STYPE temp[32]); \ SIMD_ALIGNED(DTYPE out[32]); \ memset(temp, 0, 32 * SBPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, scale, n); \ } \ memcpy(temp, src_ptr + n, r * SBPP); \ ANY_SIMD(temp, out, scale, MASK + 1); \ memcpy(dst_ptr + n, out, r * BPP); \ } #ifdef HAS_CONVERT16TO8ROW_SSSE3 ANY11C(Convert16To8Row_Any_SSSE3, Convert16To8Row_SSSE3, 2, 1, uint16_t, uint8_t, 15) #endif #ifdef HAS_CONVERT16TO8ROW_AVX2 ANY11C(Convert16To8Row_Any_AVX2, Convert16To8Row_AVX2, 2, 1, uint16_t, uint8_t, 31) #endif #ifdef HAS_CONVERT8TO16ROW_SSE2 ANY11C(Convert8To16Row_Any_SSE2, Convert8To16Row_SSE2, 1, 2, uint8_t, uint16_t, 15) #endif #ifdef HAS_CONVERT8TO16ROW_AVX2 ANY11C(Convert8To16Row_Any_AVX2, Convert8To16Row_AVX2, 1, 2, uint8_t, uint16_t, 31) #endif #ifdef HAS_MULTIPLYROW_16_AVX2 ANY11C(MultiplyRow_16_Any_AVX2, MultiplyRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) #endif #ifdef HAS_MULTIPLYROW_16_NEON ANY11C(MultiplyRow_16_Any_NEON, MultiplyRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) #endif #ifdef HAS_DIVIDEROW_16_AVX2 ANY11C(DivideRow_16_Any_AVX2, DivideRow_16_AVX2, 2, 2, uint16_t, uint16_t, 31) #endif #ifdef HAS_DIVIDEROW_16_NEON ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) #endif #undef ANY11C // Any 1 to 1 with 
parameter and shorts to byte. BPP measures in shorts. #define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ SIMD_ALIGNED(ST temp[32]); \ SIMD_ALIGNED(T out[32]); \ memset(temp, 0, SBPP * 32); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, param, n); \ } \ memcpy(temp, src_ptr + n, r * SBPP); \ ANY_SIMD(temp, out, param, MASK + 1); \ memcpy(dst_ptr + n, out, r * BPP); \ } #ifdef HAS_HALFFLOATROW_SSE2 ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) #endif #ifdef HAS_HALFFLOATROW_AVX2 ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) #endif #ifdef HAS_HALFFLOATROW_F16C ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, uint16_t, uint16_t, 2, 2, 15) #endif #ifdef HAS_HALFFLOATROW_NEON ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, uint16_t, uint16_t, 2, 2, 7) #endif #ifdef HAS_HALFFLOATROW_MSA ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) #endif #ifdef HAS_BYTETOFLOATROW_NEON ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) #endif #undef ANY11P16 // Any 1 to 1 with yuvconstants #define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ const struct YuvConstants* yuvconstants, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 2]); \ memset(temp, 0, 128); /* for YUY2 and msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #if 
defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) #endif #if defined(HAS_YUY2TOARGBROW_AVX2) ANY11C(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) #endif #if defined(HAS_YUY2TOARGBROW_NEON) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #endif #if defined(HAS_YUY2TOARGBROW_MSA) ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) #endif #if defined(HAS_YUY2TOARGBROW_MMI) ANY11C(YUY2ToARGBRow_Any_MMI, YUY2ToARGBRow_MMI, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) #endif #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. #define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, \ int width, int source_y_fraction) { \ SIMD_ALIGNED(uint8_t temp[64 * 3]); \ memset(temp, 0, 64 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(dst_ptr, src_ptr, src_stride, n, source_y_fraction); \ } \ memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ memcpy(temp + 64, src_ptr + src_stride + n * SBPP, r * SBPP); \ ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } #ifdef HAS_INTERPOLATEROW_AVX2 ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_NEON ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_MSA ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_MMI ANY11I(InterpolateRow_Any_MMI, 
InterpolateRow_MMI, 1, 1, 7) #endif #undef ANY11I // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ SIMD_ALIGNED(uint8_t temp[64 * 2]); \ memset(temp, 0, 64); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ } \ memcpy(temp, src_ptr, r* BPP); \ ANY_SIMD(temp, temp + 64, MASK + 1); \ memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ } #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #endif #ifdef HAS_MIRRORROW_SSSE3 ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #endif #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) #endif #ifdef HAS_MIRRORROW_MMI ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7) #endif #ifdef HAS_MIRRORUVROW_AVX2 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) #endif #ifdef HAS_MIRRORUVROW_SSSE3 ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) #endif #ifdef HAS_MIRRORUVROW_NEON ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #endif #ifdef HAS_MIRRORUVROW_MSA ANY11M(MirrorUVRow_Any_MSA, MirrorUVRow_MSA, 2, 7) #endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif #ifdef HAS_ARGBMIRRORROW_SSE2 ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #endif #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7) #endif #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #endif #ifdef HAS_ARGBMIRRORROW_MMI ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) #endif #ifdef HAS_RGB24MIRRORROW_SSSE3 ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) #endif #ifdef HAS_RGB24MIRRORROW_NEON ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) #endif #undef ANY11M // Any 1 plane. 
// (memset)
//
// ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) defines NAMEANY(), which fills a
// destination row with the value v32 of type T.  The leading (width & ~MASK)
// units are written directly by the kernel; the kernel is then run once on a
// 64 byte scratch buffer for MASK + 1 units, and the first (width & MASK)
// units of that buffer are copied to the end of the destination.
// BPP is the byte count per unit of width.
#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)        \
  void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
    SIMD_ALIGNED(uint8_t temp[64]);                  \
    memset(temp, 0, 64); /* for msan */              \
    int r = width & MASK;                            \
    int n = width & ~MASK;                           \
    if (n > 0) {                                     \
      ANY_SIMD(dst_ptr, v32, n);                     \
    }                                                \
    ANY_SIMD(temp, v32, MASK + 1);                   \
    memcpy(dst_ptr + n * BPP, temp, r * BPP);        \
  }

#ifdef HAS_SETROW_X86
ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
#endif
#ifdef HAS_SETROW_NEON
ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
#endif
#ifdef HAS_ARGBSETROW_NEON
ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
#endif
#ifdef HAS_ARGBSETROW_MSA
ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
#endif
#ifdef HAS_ARGBSETROW_MMI
ANY1(ARGBSetRow_Any_MMI, ARGBSetRow_MMI, uint32_t, 4, 3)
#endif
#undef ANY1

// Any 1 to 2.  Outputs UV planes.
//
// ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) defines NAMEANY()
// around a kernel taking (src, dst_u, dst_v, width).  The tail is staged in
// the zeroed first 128 bytes of the scratch buffer; the kernel writes its U
// and V output at temp + 128 and temp + 256.  UVSHIFT scales the source
// offset and tail length, DUVSHIFT scales the destination offset and tail
// length, both through (n >> shift) and SS(r, shift).
#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)          \
  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,  \
               int width) {                                             \
    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                \
    memset(temp, 0, 128); /* for msan */                                \
    int r = width & MASK;                                               \
    int n = width & ~MASK;                                              \
    if (n > 0) {                                                        \
      ANY_SIMD(src_ptr, dst_u, dst_v, n);                               \
    }                                                                   \
    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                   \
    memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));       \
    memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));       \
  }

#ifdef HAS_SPLITUVROW_SSE2
ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_AVX2
ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31)
#endif
#ifdef HAS_SPLITUVROW_NEON
ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
#endif
#ifdef HAS_SPLITUVROW_MSA
ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
#endif
#ifdef HAS_SPLITUVROW_MMI
ANY12(SplitUVRow_Any_MMI, SplitUVRow_MMI, 0, 2, 0, 7)
#endif
#ifdef HAS_ARGBTOUV444ROW_SSSE3
ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_AVX2
ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31)
#endif
#ifdef HAS_YUY2TOUV422ROW_SSE2
ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15)
#endif
// NOTE(review): in the NEON, MSA and MMI groups below the ARGBToUV444Row
// wrapper is guarded by the HAS_YUY2TOUV422ROW_* macro rather than a macro
// named after ARGBToUV444Row - confirm this is intended.
#ifdef HAS_YUY2TOUV422ROW_NEON
ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
#endif
#ifdef HAS_YUY2TOUV422ROW_MSA
ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
#endif
#ifdef HAS_YUY2TOUV422ROW_MMI
ANY12(ARGBToUV444Row_Any_MMI, ARGBToUV444Row_MMI, 0, 4, 0, 7)
ANY12(UYVYToUV422Row_Any_MMI, UYVYToUV422Row_MMI, 1, 4, 1, 15)
ANY12(YUY2ToUV422Row_Any_MMI, YUY2ToUV422Row_MMI, 1, 4, 1, 15)
#endif
#undef ANY12

// Any 1 to 2 with parameter, typed rows.  Splits one interleaved row of T
// into two planes.
//
// ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK) defines NAMEANY() around a kernel
// taking (src_uv, dst_u, dst_v, depth, width).  BPP is the byte count of one
// T element.  The scratch array holds 64 elements: elements 0..31 stage the
// interleaved source tail (two elements per unit of width), elements 32..47
// receive U and elements 48..63 receive V.
#define ANY12PT(NAMEANY, ANY_SIMD, T, BPP, MASK)                            \
  void NAMEANY(const T* src_uv, T* dst_u, T* dst_v, int depth, int width) { \
    SIMD_ALIGNED(T temp[16 * 4]);                                           \
    memset(temp, 0, 16 * 4 * BPP); /* for msan */                           \
    int r = width & MASK;                                                   \
    int n = width & ~MASK;                                                  \
    if (n > 0) {                                                            \
      ANY_SIMD(src_uv, dst_u, dst_v, depth, n);                             \
    }                                                                       \
    memcpy(temp, src_uv + n * 2, r * BPP * 2);                              \
    ANY_SIMD(temp, temp + 32, temp + 48, depth, MASK + 1);                  \
    memcpy(dst_u + n, temp + 32, r * BPP);                                  \
    memcpy(dst_v + n, temp + 48, r * BPP);                                  \
  }

#ifdef HAS_SPLITUVROW_16_AVX2
ANY12PT(SplitUVRow_16_Any_AVX2, SplitUVRow_16_AVX2, uint16_t, 2, 15)
#endif
#ifdef HAS_SPLITUVROW_16_NEON
ANY12PT(SplitUVRow_16_Any_NEON, SplitUVRow_16_NEON, uint16_t, 2, 7)
#endif
// Undefine the macro defined above (this previously named ANY21CT, which left
// ANY12PT defined for the rest of the file).
#undef ANY12PT

// Any 1 to 3.  Outputs RGB planes.
// ANY13(NAMEANY, ANY_SIMD, BPP, MASK) defines NAMEANY() around a kernel
// taking (src, dst_r, dst_g, dst_b, width).  BPP is the source byte count per
// pixel (3 and 4 are used below); each output plane gets one byte per pixel.
//
// Scratch layout (112 bytes): bytes 0..63 stage the source tail, followed by
// three 16 byte output areas at 64, 80 and 96.  The source area is sized for
// the largest case used below (MASK + 1 = 16 pixels of 4 bytes), so the
// kernel's 64 byte read no longer overlaps the first output area, and the
// whole source area is zeroed for msan.
#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK)                                \
  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g,     \
               uint8_t* dst_b, int width) {                                \
    SIMD_ALIGNED(uint8_t temp[16 * 7]);                                    \
    memset(temp, 0, 16 * 4); /* for msan */                                \
    int r = width & MASK;                                                  \
    int n = width & ~MASK;                                                 \
    if (n > 0) {                                                           \
      ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n);                           \
    }                                                                      \
    memcpy(temp, src_ptr + n * BPP, r * BPP);                              \
    ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, MASK + 1); \
    memcpy(dst_r + n, temp + 16 * 4, r);                                   \
    memcpy(dst_g + n, temp + 16 * 5, r);                                   \
    memcpy(dst_b + n, temp + 16 * 6, r);                                   \
  }

#ifdef HAS_SPLITRGBROW_SSSE3
ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_NEON
ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
#endif
#ifdef HAS_SPLITRGBROW_MMI
ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3)
#endif
#ifdef HAS_SPLITXRGBROW_SSE2
ANY13(SplitXRGBRow_Any_SSE2, SplitXRGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITXRGBROW_SSSE3
ANY13(SplitXRGBRow_Any_SSSE3, SplitXRGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITXRGBROW_AVX2
ANY13(SplitXRGBRow_Any_AVX2, SplitXRGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITXRGBROW_NEON
ANY13(SplitXRGBRow_Any_NEON, SplitXRGBRow_NEON, 4, 15)
#endif
#undef ANY13

// Any 1 to 4.  Outputs ARGB planes.
// ANY14(NAMEANY, ANY_SIMD, BPP, MASK) defines NAMEANY() around a kernel
// taking (src, dst_r, dst_g, dst_b, dst_a, width).  BPP is the source byte
// count per pixel; each output plane gets one byte per pixel.
// Scratch layout (128 bytes): bytes 0..63 stage the zeroed source tail,
// followed by four 16 byte output areas at 64, 80, 96 and 112.
#define ANY14(NAMEANY, ANY_SIMD, BPP, MASK)                                    \
  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g,         \
               uint8_t* dst_b, uint8_t* dst_a, int width) {                    \
    SIMD_ALIGNED(uint8_t temp[16 * 8]);                                        \
    memset(temp, 0, 16 * 4); /* for msan */                                    \
    int r = width & MASK;                                                      \
    int n = width & ~MASK;                                                     \
    if (n > 0) {                                                               \
      ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, dst_a, n);                        \
    }                                                                          \
    memcpy(temp, src_ptr + n * BPP, r * BPP);                                  \
    ANY_SIMD(temp, temp + 16 * 4, temp + 16 * 5, temp + 16 * 6, temp + 16 * 7, \
             MASK + 1);                                                        \
    memcpy(dst_r + n, temp + 16 * 4, r);                                       \
    memcpy(dst_g + n, temp + 16 * 5, r);                                       \
    memcpy(dst_b + n, temp + 16 * 6, r);                                       \
    memcpy(dst_a + n, temp + 16 * 7, r);                                       \
  }

#ifdef HAS_SPLITARGBROW_SSE2
ANY14(SplitARGBRow_Any_SSE2, SplitARGBRow_SSE2, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_SSSE3
ANY14(SplitARGBRow_Any_SSSE3, SplitARGBRow_SSSE3, 4, 7)
#endif
#ifdef HAS_SPLITARGBROW_AVX2
ANY14(SplitARGBRow_Any_AVX2, SplitARGBRow_AVX2, 4, 15)
#endif
#ifdef HAS_SPLITARGBROW_NEON
ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15)
#endif
/* Undefine the helper so it does not stay defined for the rest of the file. */
#undef ANY14

// Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
// 128 byte row allows for 32 avx ARGB pixels.
#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ uint8_t* dst_v, int width) { \ SIMD_ALIGNED(uint8_t temp[128 * 4]); \ memset(temp, 0, 128 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, src_stride, dst_u, dst_v, n); \ } \ memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP, \ SS(r, UVSHIFT) * BPP); \ if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ BPP); \ memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ } \ ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ } #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif #ifdef HAS_ABGRTOUVROW_AVX2 ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) #endif #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif #ifdef HAS_ARGBTOUVROW_SSSE3 ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) #endif #ifdef HAS_YUY2TOUVROW_AVX2 ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) #endif #ifdef HAS_YUY2TOUVROW_SSE2 ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #endif #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif #ifdef HAS_ARGBTOUVROW_MSA ANY12S(ARGBToUVRow_Any_MSA, 
       ARGBToUVRow_MSA, 0, 4, 31)
#endif
// Remaining ANY12S wrappers.  Arguments are
// (NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK): wrapper name, SIMD kernel, source
// shift, source bytes per pixel, and kernel step size minus one.
#ifdef HAS_ARGBTOUVROW_MMI
ANY12S(ARGBToUVRow_Any_MMI, ARGBToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_NEON
ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ARGBTOUVJROW_MSA
ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
#endif
#ifdef HAS_ARGBTOUVJROW_MMI
ANY12S(ARGBToUVJRow_Any_MMI, ARGBToUVJRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_NEON
ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MSA
ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 15)
#endif
#ifdef HAS_BGRATOUVROW_MMI
ANY12S(BGRAToUVRow_Any_MMI, BGRAToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_NEON
ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MSA
ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 15)
#endif
#ifdef HAS_ABGRTOUVROW_MMI
ANY12S(ABGRToUVRow_Any_MMI, ABGRToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_NEON
ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MSA
ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 15)
#endif
#ifdef HAS_RGBATOUVROW_MMI
ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
#endif
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB24TOUVJROW_NEON
ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB24TOUVROW_MSA
ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
#endif
#ifdef HAS_RGB24TOUVROW_MMI
ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVJROW_NEON
ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_MSA
ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_MMI
ANY12S(RAWToUVRow_Any_MMI, RAWToUVRow_MMI, 0, 3, 15)
#endif
#ifdef HAS_RGB565TOUVROW_NEON
ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_RGB565TOUVROW_MSA
ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
#endif
#ifdef HAS_RGB565TOUVROW_MMI
ANY12S(RGB565ToUVRow_Any_MMI, RGB565ToUVRow_MMI, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_NEON
ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_MSA
ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
#endif
#ifdef HAS_ARGB1555TOUVROW_MMI
ANY12S(ARGB1555ToUVRow_Any_MMI, ARGB1555ToUVRow_MMI, 0, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_NEON
ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
#endif
#ifdef HAS_ARGB4444TOUVROW_MMI
ANY12S(ARGB4444ToUVRow_Any_MMI, ARGB4444ToUVRow_MMI, 0, 2, 15)
#endif
#ifdef HAS_YUY2TOUVROW_NEON
ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
#endif
#ifdef HAS_UYVYTOUVROW_NEON
ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
#ifdef HAS_YUY2TOUVROW_MSA
ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
#endif
#ifdef HAS_YUY2TOUVROW_MMI
ANY12S(YUY2ToUVRow_Any_MMI, YUY2ToUVRow_MMI, 1, 4, 15)
#endif
#ifdef HAS_UYVYTOUVROW_MSA
ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
#endif
#ifdef HAS_UYVYTOUVROW_MMI
ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
#endif
#undef ANY12S

// Any 1 to 1 with source stride (2 rows of source).  Outputs UV plane.
// 128 byte row allows for 32 avx ARGB pixels.
// ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) defines NAMEANY() around a
// kernel taking (src, src_stride, dst_vu, width) that reads two source rows
// and writes one interleaved output row.  For the tail, the two rows are
// staged in the zeroed first and second 128 byte areas of the scratch buffer
// (so the kernel is called with a stride of 128) and the output is produced
// in the third area.  When width is odd and UVSHIFT is 0, the last staged
// pixel of each row is duplicated into the following pixel slot.
// The destination receives SS(r, 1) * 2 bytes at byte offset (n >> 1) * 2.
#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
  void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_vu,      \
               int width) {                                                  \
    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                     \
    memset(temp, 0, 128 * 2); /* for msan */                                 \
    int r = width & MASK;                                                    \
    int n = width & ~MASK;                                                   \
    if (n > 0) {                                                             \
      ANY_SIMD(src_ptr, src_stride, dst_vu, n);                              \
    }                                                                        \
    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
    memcpy(temp + 128, src_ptr + src_stride + (n >> UVSHIFT) * BPP,          \
           SS(r, UVSHIFT) * BPP);                                            \
    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
             BPP);                                                           \
      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
    }                                                                        \
    ANY_SIMD(temp, 128, temp + 256, MASK + 1);                               \
    memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2);                 \
  }

// Both wrappers below are guarded by HAS_AYUVTOVUROW_NEON.
#ifdef HAS_AYUVTOVUROW_NEON
ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
#endif
#undef ANY11S

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/source/row_common.cc000066400000000000000000004100121416500237200215120ustar00rootroot00000000000000
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// NOTE(review): the header names of the next three includes are missing in
// this copy of the file (they look stripped during extraction).  The last one
// is presumably <string.h>, given its comment - restore all three from the
// upstream source.
#include
#include
#include  // For memcpy and memset.
#include "libyuv/basic_types.h"
#include "libyuv/convert_argb.h"  // For kYuvI601Constants

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This macro controls YUV to RGB using unsigned math to extend range of
// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
// LIBYUV_UNLIMITED_DATA

// The following macro from row_win makes the C code match the row_win code,
// which is 7 bit fixed point for ARGBToI420:
#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \
    defined(_MSC_VER) && !defined(__clang__) &&                   \
    (defined(_M_IX86) || defined(_M_X64))
#define LIBYUV_RGB7 1
#endif

#if !defined(LIBYUV_BIT_EXACT) && (defined(__x86_64__) || defined(_M_X64) || \
                                   defined(__i386__) || defined(_M_IX86))
#define LIBYUV_ARGBTOUV_PAVGB 1
#define LIBYUV_RGBTOU_TRUNCATE 1
#define LIBYUV_ATTENUATE_DUP 1
#endif
#if defined(LIBYUV_BIT_EXACT)
#define LIBYUV_UNATTENUATE_DUP 1
#endif

// llvm x86 is poor at ternary operator, so use branchless min/max.

#define USE_BRANCHLESS 1
#if USE_BRANCHLESS
// Returns v when v >= 0, otherwise 0.  -(v >= 0) is an all-ones mask when the
// comparison holds and zero when it does not.
static __inline int32_t clamp0(int32_t v) {
  return -(v >= 0) & v;
}
// Returns 255 when v >= 255, otherwise the low 8 bits of v.  Negative inputs
// are therefore wrapped rather than preserved; run clamp0 first if needed.
// TODO(fbarchard): make clamp255 preserve negative values.
static __inline int32_t clamp255(int32_t v) {
  return (-(v >= 255) | v) & 255;
}

// Same as clamp255 with a 10 bit limit.
static __inline int32_t clamp1023(int32_t v) {
  return (-(v >= 1023) | v) & 1023;
}

// clamp to max
// NOTE: the final "& max" clears every bit of v that is not set in max, so
// for 0 <= v < max the value is returned unchanged only when max has the
// form 2^n - 1 (as 255 and 1023 above).  The ternary variant in the #else
// branch has no such restriction.
static __inline int32_t ClampMax(int32_t v, int32_t max) {
  return (-(v >= max) | v) & max;
}

// Absolute value: m is -1 for negative v and 0 otherwise; (v + m) ^ m negates
// v when m is -1 and leaves it alone when m is 0.
static __inline uint32_t Abs(int32_t v) {
  int m = -(v < 0);
  return (v + m) ^ m;
}
#else   // USE_BRANCHLESS
static __inline int32_t clamp0(int32_t v) {
  return (v < 0) ? 0 : v;
}

static __inline int32_t clamp255(int32_t v) {
  return (v > 255) ? 255 : v;
}

static __inline int32_t clamp1023(int32_t v) {
  return (v > 1023) ? 1023 : v;
}

static __inline int32_t ClampMax(int32_t v, int32_t max) {
  return (v > max) ? max : v;
}

static __inline uint32_t Abs(int32_t v) {
  return (v < 0) ? -v : v;
}
#endif  // USE_BRANCHLESS

// Clamps val to 0..255 (clamp0 followed by clamp255).
static __inline uint32_t Clamp(int32_t val) {
  int v = clamp0(val);
  return (uint32_t)(clamp255(v));
}

// Clamps val to 0..1023 (clamp0 followed by clamp1023).
static __inline uint32_t Clamp10(int32_t val) {
  int v = clamp0(val);
  return (uint32_t)(clamp1023(v));
}

// WRITEWORD(p, v) stores the 32 bit value v at p.
// Little Endian
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
    defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
// Targets selected here store v in native byte order.  memcpy is used instead
// of a *(uint32_t*) cast so p needs no 4 byte alignment and a uint8_t buffer
// is never written through a uint32_t lvalue.
static inline void WRITEWORD(void* p, uint32_t v) {
  memcpy(p, &v, sizeof(v));
}
#else
// Other targets: write the four bytes explicitly, least significant first.
static inline void WRITEWORD(uint8_t* p, uint32_t v) {
  p[0] = (uint8_t)(v & 255);
  p[1] = (uint8_t)((v >> 8) & 255);
  p[2] = (uint8_t)((v >> 16) & 255);
  p[3] = (uint8_t)((v >> 24) & 255);
}
#endif

// Converts width 3 byte pixels to 4 byte pixels: the 3 source bytes are
// copied in order and a fourth byte of 255 is appended.
void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b = src_rgb24[0];
    uint8_t g = src_rgb24[1];
    uint8_t r = src_rgb24[2];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = 255u;
    dst_argb += 4;
    src_rgb24 += 3;
  }
}

// Converts width 3 byte pixels to 4 byte pixels: source bytes 0 and 2 are
// swapped on output and a fourth byte of 255 is appended.
void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t r = src_raw[0];
    uint8_t g = src_raw[1];
    uint8_t b = src_raw[2];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = 255u;
    dst_argb += 4;
    src_raw += 3;
  }
}

// Converts width 3 byte pixels to 4 byte pixels: byte 0 of each output pixel
// is 255, followed by the source bytes in reverse order.
void RAWToRGBARow_C(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t r = src_raw[0];
    uint8_t g = src_raw[1];
    uint8_t b = src_raw[2];
    dst_rgba[0] = 255u;
    dst_rgba[1] = b;
    dst_rgba[2] = g;
    dst_rgba[3] = r;
    dst_rgba += 4;
    src_raw += 3;
  }
}

// Reverses the byte order of each of width 3 byte pixels.
void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t r = src_raw[0];
    uint8_t g = src_raw[1];
    uint8_t b = src_raw[2];
    dst_rgb24[0] = b;
    dst_rgb24[1] = g;
    dst_rgb24[2] = r;
    dst_rgb24 += 3;
    src_raw += 3;
  }
}

// Expands width 2 byte 5:6:5 pixels to 4 byte pixels with byte 3 set to 255.
// Each field is widened to 8 bits by shifting it up and filling the vacated
// low bits with the field's own top bits.
void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
                       uint8_t* dst_argb,
                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b = src_rgb565[0] & 0x1f;
    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
    uint8_t r = src_rgb565[1] >> 3;
    dst_argb[0] = (b << 3) | (b >> 2);
    dst_argb[1] = (g << 2) | (g >> 4);
    dst_argb[2] = (r << 3) | (r >> 2);
    dst_argb[3] = 255u;
    dst_argb += 4;
    src_rgb565 += 2;
  }
}

// Expands width 2 byte 1:5:5:5 pixels to 4 byte pixels.  The three 5 bit
// fields are widened as in RGB565ToARGBRow_C; the single top bit becomes
// byte 3 as 0 or 255.
void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
                         uint8_t* dst_argb,
                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b = src_argb1555[0] & 0x1f;
    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
    uint8_t a = src_argb1555[1] >> 7;
    dst_argb[0] = (b << 3) | (b >> 2);
    dst_argb[1] = (g << 3) | (g >> 2);
    dst_argb[2] = (r << 3) | (r >> 2);
    dst_argb[3] = -a;  // a is 0 or 1; -1 truncates to 255.
    dst_argb += 4;
    src_argb1555 += 2;
  }
}

// Expands width 2 byte 4:4:4:4 pixels to 4 byte pixels by duplicating each
// nibble into both halves of its output byte.
void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
                         uint8_t* dst_argb,
                         int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b = src_argb4444[0] & 0x0f;
    uint8_t g = src_argb4444[0] >> 4;
    uint8_t r = src_argb4444[1] & 0x0f;
    uint8_t a = src_argb4444[1] >> 4;
    dst_argb[0] = (b << 4) | b;
    dst_argb[1] = (g << 4) | g;
    dst_argb[2] = (r << 4) | r;
    dst_argb[3] = (a << 4) | a;
    dst_argb += 4;
    src_argb4444 += 2;
  }
}

// Reduces width 32 bit 2:10:10:10 pixels to 8 bits per channel.  Each pixel
// is loaded and stored as a native 32 bit word via memcpy, so neither pointer
// needs 4 byte alignment.
void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t argb;
    memcpy(&ar30, src_ar30, sizeof ar30);
    uint32_t b = (ar30 >> 2) & 0xff;   // top 8 bits of the 10 bit field
    uint32_t g = (ar30 >> 12) & 0xff;
    uint32_t r = (ar30 >> 22) & 0xff;
    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
    argb = b | (g << 8) | (r << 16) | (a << 24);
    memcpy(dst_argb, &argb, sizeof argb);
    dst_argb += 4;
    src_ar30 += 4;
  }
}

// Same as AR30ToARGBRow_C but with the lowest and third output channels
// exchanged in the stored word.
void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t abgr;
    memcpy(&ar30, src_ar30, sizeof ar30);
    uint32_t b = (ar30 >> 2) & 0xff;
    uint32_t g = (ar30 >> 12) & 0xff;
    uint32_t r = (ar30 >> 22) & 0xff;
    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
    abgr = r | (g << 8) | (b << 16) | (a << 24);
    memcpy(dst_abgr, &abgr, sizeof abgr);
    dst_abgr += 4;
    src_ar30 += 4;
  }
}

// Exchanges the 10 bit fields at bits 0..9 and 20..29 of each 32 bit pixel;
// bits 10..19 and 30..31 are kept in place.
void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint32_t ar30;
    uint32_t ab30;
    memcpy(&ar30, src_ar30, sizeof ar30);
    uint32_t b = ar30 & 0x3ff;
    uint32_t ga = ar30 & 0xc00ffc00;
    uint32_t r = (ar30 >> 20) & 0x3ff;
    ab30 = r | ga | (b << 20);
    memcpy(dst_ab30, &ab30, sizeof ab30);
    dst_ab30 += 4;
    src_ar30 += 4;
  }
}

// Copies the first 3 bytes of each of width 4 byte pixels; byte 3 is dropped.
void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b = src_argb[0];
    uint8_t g = src_argb[1];
    uint8_t r = src_argb[2];
    dst_rgb[0] = b;
    dst_rgb[1] = g;
    dst_rgb[2] = r;
    dst_rgb += 3;
    src_argb += 4;
  }
}

// Writes the first 3 bytes of each of width 4 byte pixels in reverse order;
// byte 3 is dropped.
void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    uint8_t b = src_argb[0];
    uint8_t g = src_argb[1];
    uint8_t r = src_argb[2];
    dst_rgb[0] = r;
    dst_rgb[1] = g;
    dst_rgb[2] = b;
    dst_rgb += 3;
    src_argb += 4;
  }
}

// Packs width 4 byte pixels into 2 byte 5:6:5 pixels by truncating each
// channel.  Pixels are handled two at a time with one 32 bit store; an odd
// final pixel is stored as a native 16 bit value.
void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    uint8_t b0 = src_argb[0] >> 3;
    uint8_t g0 = src_argb[1] >> 2;
    uint8_t r0 = src_argb[2] >> 3;
    uint8_t b1 = src_argb[4] >> 3;
    uint8_t g1 = src_argb[5] >> 2;
    uint8_t r1 = src_argb[6] >> 3;
    // r1 << 27 can set bit 31, which a promoted int cannot represent, so
    // that term is shifted as uint32_t.
    WRITEWORD(dst_rgb,
              (uint32_t)(b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) |
                         (g1 << 21)) |
                  ((uint32_t)r1 << 27));
    dst_rgb += 4;
    src_argb += 8;
  }
  if (width & 1) {
    uint8_t b0 = src_argb[0] >> 3;
    uint8_t g0 = src_argb[1] >> 2;
    uint8_t r0 = src_argb[2] >> 3;
    // memcpy keeps the native 16 bit store free of alignment requirements.
    uint16_t rgb565 = (uint16_t)(b0 | (g0 << 5) | (r0 << 11));
    memcpy(dst_rgb, &rgb565, sizeof rgb565);
  }
}

// dither4 is a row of 4 values from 4x4 dither matrix.
// The 4x4 matrix contains values to increase RGB.  When converting to
// fewer bits (565) this provides an ordered dither.
// The order in the 4x4 matrix in first byte is upper left.
// The 4 values are passed as an int, then referenced as an array, so
// endian will not affect order of the original matrix.
But the dither4 // will containing the first pixel in the lower byte for little endian // or the upper byte for big endian. void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 11); *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 11); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { uint8_t b0 = src_argb[0] >> 3; uint8_t g0 = src_argb[1] >> 3; uint8_t r0 = src_argb[2] >> 3; uint8_t a0 = src_argb[3] >> 7; uint8_t b1 = src_argb[4] >> 3; uint8_t g1 = src_argb[5] >> 3; uint8_t r1 = src_argb[6] >> 3; uint8_t a1 = src_argb[7] >> 7; *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 5) | (r1 << 10) | (a1 << 15); dst_rgb += 4; src_argb += 8; } if (width & 1) { uint8_t b0 = src_argb[0] >> 3; uint8_t g0 = src_argb[1] >> 3; uint8_t r0 = src_argb[2] >> 3; uint8_t a0 = src_argb[3] >> 7; *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); } } void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x 
< width - 1; x += 2) { uint8_t b0 = src_argb[0] >> 4; uint8_t g0 = src_argb[1] >> 4; uint8_t r0 = src_argb[2] >> 4; uint8_t a0 = src_argb[3] >> 4; uint8_t b1 = src_argb[4] >> 4; uint8_t g1 = src_argb[5] >> 4; uint8_t r1 = src_argb[6] >> 4; uint8_t a1 = src_argb[7] >> 4; *(uint16_t*)(dst_rgb + 0) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); *(uint16_t*)(dst_rgb + 2) = b1 | (g1 << 4) | (r1 << 8) | (a1 << 12); dst_rgb += 4; src_argb += 8; } if (width & 1) { uint8_t b0 = src_argb[0] >> 4; uint8_t g0 = src_argb[1] >> 4; uint8_t r0 = src_argb[2] >> 4; uint8_t a0 = src_argb[3] >> 4; *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); } } void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { int x; for (x = 0; x < width; ++x) { uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); uint32_t a0 = (src_abgr[3] >> 6); *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); dst_ar30 += 4; src_abgr += 4; } } void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { int x; for (x = 0; x < width; ++x) { uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); uint32_t a0 = (src_argb[3] >> 6); *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); dst_ar30 += 4; src_argb += 4; } } void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { int x; for (x = 0; x < width; ++x) { dst_ar64[0] = src_argb[0] * 0x0101; dst_ar64[1] = src_argb[1] * 0x0101; dst_ar64[2] = src_argb[2] * 0x0101; dst_ar64[3] = src_argb[3] * 0x0101; dst_ar64 += 4; src_argb += 4; } } void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { int x; for (x = 0; x < width; ++x) { dst_ab64[0] = src_argb[2] * 0x0101; 
dst_ab64[1] = src_argb[1] * 0x0101; dst_ab64[2] = src_argb[0] * 0x0101; dst_ab64[3] = src_argb[3] * 0x0101; dst_ab64 += 4; src_argb += 4; } } void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { dst_argb[0] = src_ar64[0] >> 8; dst_argb[1] = src_ar64[1] >> 8; dst_argb[2] = src_ar64[2] >> 8; dst_argb[3] = src_ar64[3] >> 8; dst_argb += 4; src_ar64 += 4; } } void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { dst_argb[0] = src_ab64[2] >> 8; dst_argb[1] = src_ab64[1] >> 8; dst_argb[2] = src_ab64[0] >> 8; dst_argb[3] = src_ab64[3] >> 8; dst_argb += 4; src_ab64 += 4; } } // TODO(fbarchard): Make shuffle compatible with SIMD versions void AR64ShuffleRow_C(const uint8_t* src_ar64, uint8_t* dst_ar64, const uint8_t* shuffler, int width) { const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; uint16_t* dst_ar64_16 = (uint16_t*)dst_ar64; int index0 = shuffler[0] / 2; int index1 = shuffler[2] / 2; int index2 = shuffler[4] / 2; int index3 = shuffler[6] / 2; // Shuffle a row of AR64. int x; for (x = 0; x < width / 2; ++x) { // To support in-place conversion. uint16_t b = src_ar64_16[index0]; uint16_t g = src_ar64_16[index1]; uint16_t r = src_ar64_16[index2]; uint16_t a = src_ar64_16[index3]; dst_ar64_16[0] = b; dst_ar64_16[1] = g; dst_ar64_16[2] = r; dst_ar64_16[3] = a; src_ar64_16 += 4; dst_ar64_16 += 4; } } #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { return ((33 * r + 65 * g + 13 * b) >> 7) + 16; } #else // 8 bit // Intel SSE/AVX uses the following equivalent formula // 0x7e80 = (66 + 129 + 25) * -128 + 0x1000 (for +16) and 0x0080 for round. 
// return (66 * ((int)r - 128) + 129 * ((int)g - 128) + 25 * ((int)b - 128) + // 0x7e80) >> 8; static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; } #endif #define AVGB(a, b) (((a) + (b) + 1) >> 1) // LIBYUV_RGBTOU_TRUNCATE mimics x86 code that does not round. #ifdef LIBYUV_RGBTOU_TRUNCATE static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; } static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; } #else // TODO(fbarchard): Add rounding to x86 SIMD and use this static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } #endif // LIBYUV_ARGBTOUV_PAVGB mimics x86 code that subsamples with 2 pavgb. #if !defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; } static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; } #endif // ARGBToY_C and ARGBToUV_C // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ int x; \ for (x = 0; x < width; ++x) { \ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ src_rgb += BPP; \ dst_y += 1; \ } \ } \ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ 
AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ dst_u[0] = RGBToU(ar, ag, ab); \ dst_v[0] = RGBToV(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ dst_u[0] = RGBToU(ar, ag, ab); \ dst_v[0] = RGBToV(ar, ag, ab); \ } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ int x; \ for (x = 0; x < width; ++x) { \ dst_y[0] = RGBToY(src_rgb[R], src_rgb[G], src_rgb[B]); \ src_rgb += BPP; \ dst_y += 1; \ } \ } \ void NAME##ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ src_rgb1[B + BPP] + 1) >> \ 1; \ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ src_rgb1[G + BPP] + 1) >> \ 1; \ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ src_rgb1[R + BPP] + 1) >> \ 1; \ dst_u[0] = RGB2xToU(ar, ag, ab); \ dst_v[0] = RGB2xToV(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ uint16_t ab = src_rgb[B] + src_rgb1[B]; \ uint16_t ag = src_rgb[G] + src_rgb1[G]; \ uint16_t ar = src_rgb[R] + src_rgb1[R]; \ dst_u[0] = RGB2xToU(ar, ag, ab); \ dst_v[0] = RGB2xToV(ar, ag, ab); \ } \ } #endif MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) MAKEROWY(ABGR, 0, 1, 2, 4) MAKEROWY(RGBA, 3, 2, 1, 4) MAKEROWY(RGB24, 2, 1, 0, 3) MAKEROWY(RAW, 0, 1, 2, 3) #undef MAKEROWY // JPeg uses a variation on BT.601-1 full range // y = 0.29900 * r + 0.58700 * g + 0.11400 * b // u 
= -0.16874 * r - 0.33126 * g + 0.50000 * b + center // v = 0.50000 * r - 0.41869 * g - 0.08131 * b + center // BT.601 Mpeg range uses: // b 0.1016 * 255 = 25.908 = 25 // g 0.5078 * 255 = 129.489 = 129 // r 0.2578 * 255 = 65.739 = 66 // JPeg 7 bit Y (deprecated) // b 0.11400 * 128 = 14.592 = 15 // g 0.58700 * 128 = 75.136 = 75 // r 0.29900 * 128 = 38.272 = 38 // JPeg 8 bit Y: // b 0.11400 * 256 = 29.184 = 29 // g 0.58700 * 256 = 150.272 = 150 // r 0.29900 * 256 = 76.544 = 77 // JPeg 8 bit U: // b 0.50000 * 255 = 127.5 = 127 // g -0.33126 * 255 = -84.4713 = -84 // r -0.16874 * 255 = -43.0287 = -43 // JPeg 8 bit V: // b -0.08131 * 255 = -20.73405 = -20 // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (38 * r + 75 * g + 15 * b + 64) >> 7; } #else // 8 bit static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { return (77 * r + 150 * g + 29 * b + 128) >> 8; } #endif #if defined(LIBYUV_ARGBTOUV_PAVGB) static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #else static __inline int RGB2xToUJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * b - (84 / 2) * g - (43 / 2) * r + 0x8080) >> 8; } static __inline int RGB2xToVJ(uint16_t r, uint16_t g, uint16_t b) { return ((127 / 2) * r - (107 / 2) * g - (20 / 2) * b + 0x8080) >> 8; } #endif // ARGBToYJ_C and ARGBToUVJ_C // Intel version mimic SSE/AVX which does 2 pavgb #if LIBYUV_ARGBTOUV_PAVGB #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ int x; \ for (x = 0; x < width; ++x) { \ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ src_rgb += BPP; \ dst_y += 1; \ } \ } \ void 
NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ uint8_t ab = AVGB(AVGB(src_rgb[B], src_rgb1[B]), \ AVGB(src_rgb[B + BPP], src_rgb1[B + BPP])); \ uint8_t ag = AVGB(AVGB(src_rgb[G], src_rgb1[G]), \ AVGB(src_rgb[G + BPP], src_rgb1[G + BPP])); \ uint8_t ar = AVGB(AVGB(src_rgb[R], src_rgb1[R]), \ AVGB(src_rgb[R + BPP], src_rgb1[R + BPP])); \ dst_u[0] = RGBToUJ(ar, ag, ab); \ dst_v[0] = RGBToVJ(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ uint8_t ab = AVGB(src_rgb[B], src_rgb1[B]); \ uint8_t ag = AVGB(src_rgb[G], src_rgb1[G]); \ uint8_t ar = AVGB(src_rgb[R], src_rgb1[R]); \ dst_u[0] = RGBToUJ(ar, ag, ab); \ dst_v[0] = RGBToVJ(ar, ag, ab); \ } \ } #else // ARM version does sum / 2 then multiply by 2x smaller coefficients #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width) { \ int x; \ for (x = 0; x < width; ++x) { \ dst_y[0] = RGBToYJ(src_rgb[R], src_rgb[G], src_rgb[B]); \ src_rgb += BPP; \ dst_y += 1; \ } \ } \ void NAME##ToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, \ uint8_t* dst_u, uint8_t* dst_v, int width) { \ const uint8_t* src_rgb1 = src_rgb + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ uint16_t ab = (src_rgb[B] + src_rgb[B + BPP] + src_rgb1[B] + \ src_rgb1[B + BPP] + 1) >> \ 1; \ uint16_t ag = (src_rgb[G] + src_rgb[G + BPP] + src_rgb1[G] + \ src_rgb1[G + BPP] + 1) >> \ 1; \ uint16_t ar = (src_rgb[R] + src_rgb[R + BPP] + src_rgb1[R] + \ src_rgb1[R + BPP] + 1) >> \ 1; \ dst_u[0] = RGB2xToUJ(ar, ag, ab); \ dst_v[0] = RGB2xToVJ(ar, ag, ab); \ src_rgb += BPP * 2; \ src_rgb1 += BPP * 2; \ dst_u += 1; \ dst_v += 1; \ } \ if (width & 1) { \ uint16_t ab = (src_rgb[B] + src_rgb1[B]); \ uint16_t ag = (src_rgb[G] + src_rgb1[G]); \ uint16_t ar = (src_rgb[R] + 
src_rgb1[R]); \ dst_u[0] = RGB2xToUJ(ar, ag, ab); \ dst_v[0] = RGB2xToVJ(ar, ag, ab); \ } \ } #endif MAKEROWYJ(ARGB, 2, 1, 0, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4) MAKEROWYJ(RGB24, 2, 1, 0, 3) MAKEROWYJ(RAW, 0, 1, 2, 3) #undef MAKEROWYJ void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_rgb565[0] & 0x1f; uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); uint8_t r = src_rgb565[1] >> 3; b = (b << 3) | (b >> 2); g = (g << 2) | (g >> 4); r = (r << 3) | (r >> 2); dst_y[0] = RGBToY(r, g, b); src_rgb565 += 2; dst_y += 1; } } void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_argb1555[0] & 0x1f; uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); uint8_t r = (src_argb1555[1] & 0x7c) >> 2; b = (b << 3) | (b >> 2); g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); dst_y[0] = RGBToY(r, g, b); src_argb1555 += 2; dst_y += 1; } } void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { uint8_t b = src_argb4444[0] & 0x0f; uint8_t g = src_argb4444[0] >> 4; uint8_t r = src_argb4444[1] & 0x0f; b = (b << 4) | b; g = (g << 4) | g; r = (r << 4) | r; dst_y[0] = RGBToY(r, g, b); src_argb4444 += 2; dst_y += 1; } } void RGB565ToUVRow_C(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { uint8_t b0 = src_rgb565[0] & 0x1f; uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); uint8_t r0 = src_rgb565[1] >> 3; uint8_t b1 = src_rgb565[2] & 0x1f; uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); uint8_t r1 = src_rgb565[3] >> 3; uint8_t b2 = next_rgb565[0] & 0x1f; uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); uint8_t r2 = next_rgb565[1] >> 3; uint8_t b3 = 
next_rgb565[2] & 0x1f; uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); uint8_t r3 = next_rgb565[3] >> 3; b0 = (b0 << 3) | (b0 >> 2); g0 = (g0 << 2) | (g0 >> 4); r0 = (r0 << 3) | (r0 >> 2); b1 = (b1 << 3) | (b1 >> 2); g1 = (g1 << 2) | (g1 >> 4); r1 = (r1 << 3) | (r1 >> 2); b2 = (b2 << 3) | (b2 >> 2); g2 = (g2 << 2) | (g2 >> 4); r2 = (r2 << 3) | (r2 >> 2); b3 = (b3 << 3) | (b3 >> 2); g3 = (g3 << 2) | (g3 >> 4); r3 = (r3 << 3) | (r3 >> 2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; dst_u[0] = RGB2xToU(r, g, b); dst_v[0] = RGB2xToV(r, g, b); #endif src_rgb565 += 4; next_rgb565 += 4; dst_u += 1; dst_v += 1; } if (width & 1) { uint8_t b0 = src_rgb565[0] & 0x1f; uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); uint8_t r0 = src_rgb565[1] >> 3; uint8_t b2 = next_rgb565[0] & 0x1f; uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); uint8_t r2 = next_rgb565[1] >> 3; b0 = (b0 << 3) | (b0 >> 2); g0 = (g0 << 2) | (g0 >> 4); r0 = (r0 << 3) | (r0 >> 2); b2 = (b2 << 3) | (b2 >> 2); g2 = (g2 << 2) | (g2 >> 4); r2 = (r2 << 3) | (r2 >> 2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = b0 + b2; uint16_t g = g0 + g2; uint16_t r = r0 + r2; dst_u[0] = RGB2xToU(r, g, b); dst_v[0] = RGB2xToV(r, g, b); #endif } } void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { uint8_t b0 = src_argb1555[0] & 0x1f; 
uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; uint8_t b1 = src_argb1555[2] & 0x1f; uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; uint8_t b2 = next_argb1555[0] & 0x1f; uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; uint8_t b3 = next_argb1555[2] & 0x1f; uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; b0 = (b0 << 3) | (b0 >> 2); g0 = (g0 << 3) | (g0 >> 2); r0 = (r0 << 3) | (r0 >> 2); b1 = (b1 << 3) | (b1 >> 2); g1 = (g1 << 3) | (g1 >> 2); r1 = (r1 << 3) | (r1 >> 2); b2 = (b2 << 3) | (b2 >> 2); g2 = (g2 << 3) | (g2 >> 2); r2 = (r2 << 3) | (r2 >> 2); b3 = (b3 << 3) | (b3 >> 2); g3 = (g3 << 3) | (g3 >> 2); r3 = (r3 << 3) | (r3 >> 2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; dst_u[0] = RGB2xToU(r, g, b); dst_v[0] = RGB2xToV(r, g, b); #endif src_argb1555 += 4; next_argb1555 += 4; dst_u += 1; dst_v += 1; } if (width & 1) { uint8_t b0 = src_argb1555[0] & 0x1f; uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; uint8_t b2 = next_argb1555[0] & 0x1f; uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; b0 = (b0 << 3) | (b0 >> 2); g0 = (g0 << 3) | (g0 >> 2); r0 = (r0 << 3) | (r0 >> 2); b2 = (b2 << 3) | (b2 >> 2); g2 = (g2 << 3) | (g2 >> 2); r2 = (r2 << 3) | (r2 >> 2); #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = 
AVGB(r0, r2); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = b0 + b2; uint16_t g = g0 + g2; uint16_t r = r0 + r2; dst_u[0] = RGB2xToU(r, g, b); dst_v[0] = RGB2xToV(r, g, b); #endif } } void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; int x; for (x = 0; x < width - 1; x += 2) { uint8_t b0 = src_argb4444[0] & 0x0f; uint8_t g0 = src_argb4444[0] >> 4; uint8_t r0 = src_argb4444[1] & 0x0f; uint8_t b1 = src_argb4444[2] & 0x0f; uint8_t g1 = src_argb4444[2] >> 4; uint8_t r1 = src_argb4444[3] & 0x0f; uint8_t b2 = next_argb4444[0] & 0x0f; uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; uint8_t b3 = next_argb4444[2] & 0x0f; uint8_t g3 = next_argb4444[2] >> 4; uint8_t r3 = next_argb4444[3] & 0x0f; b0 = (b0 << 4) | b0; g0 = (g0 << 4) | g0; r0 = (r0 << 4) | r0; b1 = (b1 << 4) | b1; g1 = (g1 << 4) | g1; r1 = (r1 << 4) | r1; b2 = (b2 << 4) | b2; g2 = (g2 << 4) | g2; r2 = (r2 << 4) | r2; b3 = (b3 << 4) | b3; g3 = (g3 << 4) | g3; r3 = (r3 << 4) | r3; #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; uint16_t r = (r0 + r1 + r2 + r3 + 1) >> 1; dst_u[0] = RGB2xToU(r, g, b); dst_v[0] = RGB2xToV(r, g, b); #endif src_argb4444 += 4; next_argb4444 += 4; dst_u += 1; dst_v += 1; } if (width & 1) { uint8_t b0 = src_argb4444[0] & 0x0f; uint8_t g0 = src_argb4444[0] >> 4; uint8_t r0 = src_argb4444[1] & 0x0f; uint8_t b2 = next_argb4444[0] & 0x0f; uint8_t g2 = next_argb4444[0] >> 4; uint8_t r2 = next_argb4444[1] & 0x0f; b0 = (b0 << 4) | b0; g0 = (g0 << 4) | g0; r0 = (r0 << 4) | r0; b2 = (b2 << 4) | b2; g2 = (g2 << 4) | g2; r2 = (r2 << 4) 
| r2; #if LIBYUV_ARGBTOUV_PAVGB uint8_t ab = AVGB(b0, b2); uint8_t ag = AVGB(g0, g2); uint8_t ar = AVGB(r0, r2); dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = b0 + b2; uint16_t g = g0 + g2; uint16_t r = r0 + r2; dst_u[0] = RGB2xToU(r, g, b); dst_v[0] = RGB2xToV(r, g, b); #endif } } void ARGBToUV444Row_C(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; for (x = 0; x < width; ++x) { uint8_t ab = src_argb[0]; uint8_t ag = src_argb[1]; uint8_t ar = src_argb[2]; dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); src_argb += 4; dst_u += 1; dst_v += 1; } } void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = src_argb[3]; dst_argb += 4; src_argb += 4; } } // Convert a row of image to Sepia tone. void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; int sb = (b * 17 + g * 68 + r * 35) >> 7; int sg = (b * 22 + g * 88 + r * 45) >> 7; int sr = (b * 24 + g * 98 + r * 50) >> 7; // b does not over flow. a is preserved from original. dst_argb[0] = sb; dst_argb[1] = clamp255(sg); dst_argb[2] = clamp255(sr); dst_argb += 4; } } // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). 
void ARGBColorMatrixRow_C(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          const int8_t* matrix_argb,
                          int width) {
  // matrix_argb is read as 4 rows of 4 signed coefficients.  Output channel c
  // is the dot product of the source pixel's 4 bytes with row c, shifted
  // right by 6 and clamped with Clamp().
  const uint8_t* in = src_argb;
  uint8_t* out = dst_argb;
  int i;
  for (i = 0; i < width; ++i, in += 4, out += 4) {
    // All four inputs are fetched before any output byte is written.
    const int c0 = in[0];
    const int c1 = in[1];
    const int c2 = in[2];
    const int c3 = in[3];
    uint8_t result[4];
    int ch;
    for (ch = 0; ch < 4; ++ch) {
      const int8_t* row = matrix_argb + ch * 4;
      const int sum = c0 * row[0] + c1 * row[1] + c2 * row[2] + c3 * row[3];
      result[ch] = Clamp(sum >> 6);
    }
    out[0] = result[0];
    out[1] = result[1];
    out[2] = result[2];
    out[3] = result[3];
  }
}

// Apply color table to a row of image.
// The row is modified in place.  table_argb is indexed as 4 bytes per entry:
// byte k of a pixel with value v is replaced by table_argb[v * 4 + k].
void ARGBColorTableRow_C(uint8_t* dst_argb,
                         const uint8_t* table_argb,
                         int width) {
  uint8_t* px = dst_argb;
  int i;
  for (i = 0; i < width; ++i) {
    // Capture the four lookup indices before overwriting the pixel.
    const int i0 = px[0];
    const int i1 = px[1];
    const int i2 = px[2];
    const int i3 = px[3];
    px[0] = table_argb[i0 * 4 + 0];
    px[1] = table_argb[i1 * 4 + 1];
    px[2] = table_argb[i2 * 4 + 2];
    px[3] = table_argb[i3 * 4 + 3];
    px += 4;
  }
}

// Apply color table to a row of image.
void RGBColorTableRow_C(uint8_t* dst_argb, const uint8_t* table_argb, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; dst_argb[0] = table_argb[b * 4 + 0]; dst_argb[1] = table_argb[g * 4 + 1]; dst_argb[2] = table_argb[r * 4 + 2]; dst_argb += 4; } } void ARGBQuantizeRow_C(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; int g = dst_argb[1]; int r = dst_argb[2]; dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset; dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset; dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset; dst_argb += 4; } } #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 24 void ARGBShadeRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { const uint32_t b_scale = REPEAT8(value & 0xff); const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); const uint32_t a_scale = REPEAT8(value >> 24); int i; for (i = 0; i < width; ++i) { const uint32_t b = REPEAT8(src_argb[0]); const uint32_t g = REPEAT8(src_argb[1]); const uint32_t r = REPEAT8(src_argb[2]); const uint32_t a = REPEAT8(src_argb[3]); dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); dst_argb[3] = SHADE(a, a_scale); src_argb += 4; dst_argb += 4; } } #undef REPEAT8 #undef SHADE #define REPEAT8(v) (v) | ((v) << 8) #define SHADE(f, v) v* f >> 16 void ARGBMultiplyRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { const uint32_t b = REPEAT8(src_argb[0]); const uint32_t g = REPEAT8(src_argb[1]); const uint32_t r = REPEAT8(src_argb[2]); const uint32_t a = REPEAT8(src_argb[3]); const uint32_t b_scale = src_argb1[0]; const uint32_t g_scale = src_argb1[1]; const uint32_t r_scale = src_argb1[2]; const 
uint32_t a_scale = src_argb1[3]; dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); dst_argb[3] = SHADE(a, a_scale); src_argb += 4; src_argb1 += 4; dst_argb += 4; } } #undef REPEAT8 #undef SHADE #define SHADE(f, v) clamp255(v + f) void ARGBAddRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb[0]; const int g = src_argb[1]; const int r = src_argb[2]; const int a = src_argb[3]; const int b_add = src_argb1[0]; const int g_add = src_argb1[1]; const int r_add = src_argb1[2]; const int a_add = src_argb1[3]; dst_argb[0] = SHADE(b, b_add); dst_argb[1] = SHADE(g, g_add); dst_argb[2] = SHADE(r, r_add); dst_argb[3] = SHADE(a, a_add); src_argb += 4; src_argb1 += 4; dst_argb += 4; } } #undef SHADE #define SHADE(f, v) clamp0(f - v) void ARGBSubtractRow_C(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb[0]; const int g = src_argb[1]; const int r = src_argb[2]; const int a = src_argb[3]; const int b_sub = src_argb1[0]; const int g_sub = src_argb1[1]; const int r_sub = src_argb1[2]; const int a_sub = src_argb1[3]; dst_argb[0] = SHADE(b, b_sub); dst_argb[1] = SHADE(g, g_sub); dst_argb[2] = SHADE(r, r_sub); dst_argb[3] = SHADE(a, a_sub); src_argb += 4; src_argb1 += 4; dst_argb += 4; } } #undef SHADE // Sobel functions which mimics SSSE3. 
void SobelXRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 const uint8_t* src_y2,
                 uint8_t* dst_sobelx,
                 int width) {
  // For each column: difference between column i and column i + 2 on three
  // rows, with the middle row counted twice.
  int col;
  for (col = 0; col < width; ++col) {
    const int top = src_y0[col] - src_y0[col + 2];
    const int middle = src_y1[col] - src_y1[col + 2];
    const int bottom = src_y2[col] - src_y2[col + 2];
    const int magnitude = Abs(top + middle * 2 + bottom);
    dst_sobelx[col] = (uint8_t)(clamp255(magnitude));
  }
}

void SobelYRow_C(const uint8_t* src_y0,
                 const uint8_t* src_y1,
                 uint8_t* dst_sobely,
                 int width) {
  // For each column: difference between the two rows at columns i, i + 1 and
  // i + 2, with the centre column counted twice.
  int col;
  for (col = 0; col < width; ++col) {
    const int left = src_y0[col + 0] - src_y1[col + 0];
    const int centre = src_y0[col + 1] - src_y1[col + 1];
    const int right = src_y0[col + 2] - src_y1[col + 2];
    const int magnitude = Abs(left + centre * 2 + right);
    dst_sobely[col] = (uint8_t)(clamp255(magnitude));
  }
}

void SobelRow_C(const uint8_t* src_sobelx,
                const uint8_t* src_sobely,
                uint8_t* dst_argb,
                int width) {
  // Clamped sum of both inputs replicated into bytes 0-2; byte 3 set to 255.
  int col;
  for (col = 0; col < width; ++col) {
    const int combined = clamp255(src_sobelx[col] + src_sobely[col]);
    dst_argb[0] = (uint8_t)(combined);
    dst_argb[1] = (uint8_t)(combined);
    dst_argb[2] = (uint8_t)(combined);
    dst_argb[3] = (uint8_t)(255u);
    dst_argb += 4;
  }
}

void SobelToPlaneRow_C(const uint8_t* src_sobelx,
                       const uint8_t* src_sobely,
                       uint8_t* dst_y,
                       int width) {
  // Clamped sum of both inputs written as a single byte per column.
  int col;
  for (col = 0; col < width; ++col) {
    const int combined = clamp255(src_sobelx[col] + src_sobely[col]);
    dst_y[col] = (uint8_t)(combined);
  }
}

void SobelXYRow_C(const uint8_t* src_sobelx,
                  const uint8_t* src_sobely,
                  uint8_t* dst_argb,
                  int width) {
  // Byte 0 = sobely, byte 1 = clamped sum, byte 2 = sobelx, byte 3 = 255.
  int col;
  for (col = 0; col < width; ++col) {
    const int gx = src_sobelx[col];
    const int gy = src_sobely[col];
    const int combined = clamp255(gx + gy);
    dst_argb[0] = (uint8_t)(gy);
    dst_argb[1] = (uint8_t)(combined);
    dst_argb[2] = (uint8_t)(gx);
    dst_argb[3] = (uint8_t)(255u);
    dst_argb += 4;
  }
}

void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  // Copy a Y to RGB.
  int col;
  for (col = 0; col < width; ++col) {
    const uint8_t luma = src_y[col];
    dst_argb[0] = luma;
    dst_argb[1] = luma;
    dst_argb[2] = luma;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}

// Macros to create SIMD specific yuv to rgb conversion constants.

// clang-format off

#if defined(__aarch64__) || defined(__arm__)
// Bias values include subtract 128 from U and V, bias from Y and rounding.
// For B and R bias is negative. For G bias is positive.
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                             \
  {{UB, VR, UG, VG, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},                     \
   {YG, (UB * 128 - YB), (UG * 128 + VG * 128 + YB), (VR * 128 - YB), YB, 0, \
    0, 0}}
#else
#define YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR)                     \
  {{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,          \
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},         \
   {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,  \
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
   {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,          \
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},         \
   {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
   {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
#endif

// clang-format on

// Defines kYuv<name>Constants and kYvu<name>Constants; the Yvu variant is
// built by swapping the U and V coefficients.
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR)            \
  const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, UB, UG, VG, VR);                   \
  const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \
      YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);

// TODO(fbarchard): Generate SIMD structures from float matrix.

// BT.601 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.596
//  G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
//  B = (Y - 16) * 1.164 + U * 2.018
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
// All U/V coefficients below are fixed point with 6 fractional bits
// (value * 64). Without LIBYUV_UNLIMITED_DATA the UB coefficient is capped
// at 128.
#ifdef LIBYUV_UNLIMITED_DATA
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* min(128, round(2.018 * 64)) */
#endif
#define UG 25  /* round(0.391 * 64) */
#define VG 52  /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */

// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR)

#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

// BT.601 full range YUV to RGB reference (aka JPEG)
// *  R = Y               + V * 1.40200
// *  G = Y - U * 0.34414 - V * 0.71414
// *  B = Y + U * 1.77200
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
#define UB 113 /* round(1.77200 * 64) */
#define UG 22  /* round(0.34414 * 64) */
#define VG 46  /* round(0.71414 * 64) */
#define VR 90  /* round(1.40200 * 64) */

// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)

#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

// BT.709 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164             + V * 1.793
//  G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
//  B = (Y - 16) * 1.164 + U * 2.112
//  KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B.
#ifdef LIBYUV_UNLIMITED_DATA
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* min(128, round(2.112 * 64)) */
#endif
#define UG 14  /* round(0.213 * 64) */
#define VG 34  /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */

// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR)

#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

// BT.709 full range YUV to RGB reference
//  R = Y               + V * 1.5748
//  G = Y - U * 0.18732 - V * 0.46812
//  B = Y + U * 1.8556
//  KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B.
#define UB 119 /* round(1.8556 * 64) */
#define UG 12  /* round(0.18732 * 64) */
#define VG 30  /* round(0.46812 * 64) */
#define VR 101 /* round(1.5748 * 64) */

// Y contribution to R,G,B. Scale and bias. (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)

#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

// BT.2020 limited range YUV to RGB reference
//  R = (Y - 16) * 1.164384                + V * 1.67867
//  G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
//  B = (Y - 16) * 1.164384 + U * 2.14177
// KR = 0.2627; KB = 0.0593

// U and V contributions to R,G,B.
#ifdef LIBYUV_UNLIMITED_DATA
#define UB 137 /* round(2.142 * 64) */
#else
#define UB 128 /* min(128, round(2.142 * 64)) */
#endif
#define UG 12  /* round(0.187326 * 64) */
#define VG 42  /* round(0.65042 * 64) */
#define VR 107 /* round(1.67867 * 64) */

// Y contribution to R,G,B. Scale and bias.
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */

MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR)

#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

// BT.2020 full range YUV to RGB reference
//  R = Y                + V * 1.474600
//  G = Y - U * 0.164553 - V * 0.571353
//  B = Y + U * 1.881400
// KR = 0.2627; KB = 0.0593

// U and V contributions to R,G,B.
#define UB 120 /* round(1.881400 * 64) */
#define UG 11  /* round(0.164553 * 64) */
#define VG 37  /* round(0.571353 * 64) */
#define VR 94  /* round(1.474600 * 64) */

// Y contribution to R,G,B. Scale and bias. (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32    /* 64 / 2 */

MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR)

#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR

// NOTE(review): BB, BG and BR are not defined anywhere in this section;
// these look like leftovers from an older constant layout - confirm before
// removing.
#undef BB
#undef BG
#undef BR

#undef MAKEYUVCONSTANTS

// LOAD_YUV_CONSTANTS declares local copies of the coefficients held in a
// variable named `yuvconstants`. CALC_RGB16 then declares b16, g16 and r16
// from locals named y32, u and v. Every function that uses the pair must
// provide exactly those names.
#if defined(__aarch64__) || defined(__arm__)
#define LOAD_YUV_CONSTANTS                 \
  int ub = yuvconstants->kUVCoeff[0];      \
  int vr = yuvconstants->kUVCoeff[1];      \
  int ug = yuvconstants->kUVCoeff[2];      \
  int vg = yuvconstants->kUVCoeff[3];      \
  int yg = yuvconstants->kRGBCoeffBias[0]; \
  int bb = yuvconstants->kRGBCoeffBias[1]; \
  int bg = yuvconstants->kRGBCoeffBias[2]; \
  int br = yuvconstants->kRGBCoeffBias[3]

// u and v are used as unsigned values here; the per-channel bias terms
// bb, bg and br are applied to the result instead.
#define CALC_RGB16                         \
  int32_t y1 = (uint32_t)(y32 * yg) >> 16; \
  int b16 = y1 + (u * ub) - bb;            \
  int g16 = y1 + bg - (u * ug + v * vg);   \
  int r16 = y1 + (v * vr) - br
#else
#define LOAD_YUV_CONSTANTS           \
  int ub = yuvconstants->kUVToB[0];  \
  int ug = yuvconstants->kUVToG[0];  \
  int vg = yuvconstants->kUVToG[1];  \
  int vr = yuvconstants->kUVToR[1];  \
  int yg = yuvconstants->kYToRgb[0]; \
  int yb = yuvconstants->kYBiasToRgb[0]

// u and v are narrowed to int8_t and offset by 0x80 before being multiplied
// by the coefficients; the Y bias yb is added to the scaled luma.
#define CALC_RGB16                                \
  int32_t y1 = ((uint32_t)(y32 * yg) >> 16) + yb; \
  int8_t ui = u;                                  \
  int8_t vi = v;                                  \
  ui -= 0x80;                                     \
  vi -= 0x80;                                     \
  int b16 = y1 + (ui * ub);                       \
  int g16 = y1 - (ui * ug + vi * vg);             \
  int r16 = y1 + (vi * vr)
#endif

// C reference code that mimics the YUV assembly.
// Reads 8 bit YUV and leaves result as 8 bit.
static __inline void YuvPixel(uint8_t y,
                              uint8_t u,
                              uint8_t v,
                              uint8_t* b,
                              uint8_t* g,
                              uint8_t* r,
                              const struct YuvConstants* yuvconstants) {
  LOAD_YUV_CONSTANTS;
  // Replicate the 8 bit luma into both bytes of a 16 bit value.
  uint32_t y32 = y * 0x0101;
  CALC_RGB16;
  // Drop the 6 fractional bits, then clamp into the uint8_t outputs.
  *b = Clamp((int32_t)(b16) >> 6);
  *g = Clamp((int32_t)(g16) >> 6);
  *r = Clamp((int32_t)(r16) >> 6);
}

// Reads 8 bit YUV and leaves result as 16 bit.
static __inline void YuvPixel8_16(uint8_t y, uint8_t u, uint8_t v, int* b, int* g, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y * 0x0101; CALC_RGB16; *b = b16; *g = g16; *r = r16; } // C reference code that mimics the YUV 16 bit assembly. // Reads 10 bit YUV and leaves result as 16 bit. static __inline void YuvPixel10_16(uint16_t y, uint16_t u, uint16_t v, int* b, int* g, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y << 6; u = clamp255(u >> 2); v = clamp255(v >> 2); CALC_RGB16; *b = b16; *g = g16; *r = r16; } // C reference code that mimics the YUV 16 bit assembly. // Reads 12 bit YUV and leaves result as 16 bit. static __inline void YuvPixel12_16(int16_t y, int16_t u, int16_t v, int* b, int* g, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y << 4; u = clamp255(u >> 4); v = clamp255(v >> 4); CALC_RGB16; *b = b16; *g = g16; *r = r16; } // C reference code that mimics the YUV 10 bit assembly. // Reads 10 bit YUV and clamps down to 8 bit RGB. static __inline void YuvPixel10(uint16_t y, uint16_t u, uint16_t v, uint8_t* b, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { int b16; int g16; int r16; YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); *b = Clamp(b16 >> 6); *g = Clamp(g16 >> 6); *r = Clamp(r16 >> 6); } // C reference code that mimics the YUV 12 bit assembly. // Reads 12 bit YUV and clamps down to 8 bit RGB. static __inline void YuvPixel12(uint16_t y, uint16_t u, uint16_t v, uint8_t* b, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { int b16; int g16; int r16; YuvPixel12_16(y, u, v, &b16, &g16, &r16, yuvconstants); *b = Clamp(b16 >> 6); *g = Clamp(g16 >> 6); *r = Clamp(r16 >> 6); } // C reference code that mimics the YUV 16 bit assembly. // Reads 16 bit YUV and leaves result as 8 bit. 
static __inline void YuvPixel16_8(uint16_t y, uint16_t u, uint16_t v, uint8_t* b, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; u = clamp255(u >> 8); v = clamp255(v >> 8); CALC_RGB16; *b = Clamp((int32_t)(b16) >> 6); *g = Clamp((int32_t)(g16) >> 6); *r = Clamp((int32_t)(r16) >> 6); } // C reference code that mimics the YUV 16 bit assembly. // Reads 16 bit YUV and leaves result as 16 bit. static __inline void YuvPixel16_16(uint16_t y, uint16_t u, uint16_t v, int* b, int* g, int* r, const struct YuvConstants* yuvconstants) { LOAD_YUV_CONSTANTS; uint32_t y32 = y; u = clamp255(u >> 8); v = clamp255(v >> 8); CALC_RGB16; *b = b16; *g = g16; *r = r16; } // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 8 bit. static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) || defined(__arm__) int yg = yuvconstants->kRGBCoeffBias[0]; int ygb = yuvconstants->kRGBCoeffBias[4]; #else int ygb = yuvconstants->kYBiasToRgb[0]; int yg = yuvconstants->kYToRgb[0]; #endif uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; *b = Clamp(((int32_t)(y1) + ygb) >> 6); *g = Clamp(((int32_t)(y1) + ygb) >> 6); *r = Clamp(((int32_t)(y1) + ygb) >> 6); } void I444ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; src_v += 1; rgb_buf += 4; // Advance 1 pixel. 
} } // Also used for 420 void I422ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } // 10 bit YUV to ARGB void I210ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void I410ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; src_v += 1; rgb_buf += 4; // Advance 1 pixels. 
} } void I210AlphaToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, const uint16_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = clamp255(src_a[0] >> 2); YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = clamp255(src_a[1] >> 2); src_y += 2; src_u += 1; src_v += 1; src_a += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = clamp255(src_a[0] >> 2); } } void I410AlphaToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, const uint16_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = clamp255(src_a[0] >> 2); src_y += 1; src_u += 1; src_v += 1; src_a += 1; rgb_buf += 4; // Advance 1 pixels. } } // 12 bit YUV to ARGB void I212ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel12(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel12(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; b = b >> 4; // convert 8 bit 10.6 to 10 bit. 
g = g >> 4; r = r >> 4; b = Clamp10(b); g = Clamp10(g); r = Clamp10(r); ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; (*(uint32_t*)rgb_buf) = ar30; } // 10 bit YUV to 10 bit AR30 void I210ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; int b; int g; int r; for (x = 0; x < width - 1; x += 2) { YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } // 12 bit YUV to 10 bit AR30 void I212ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; int b; int g; int r; for (x = 0; x < width - 1; x += 2) { YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); YuvPixel12_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel12_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } void I410ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; int b; int g; int r; for (x = 0; x < width; ++x) { YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); src_y += 1; src_u += 1; src_v += 1; rgb_buf += 4; // Advance 1 pixel. } } // P210 has 10 bits in msb of 16 bit NV12 style layout. 
void P210ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, dst_argb + 2, yuvconstants); dst_argb[3] = 255; YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], dst_argb + 4, dst_argb + 5, dst_argb + 6, yuvconstants); dst_argb[7] = 255; src_y += 2; src_uv += 2; dst_argb += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, dst_argb + 2, yuvconstants); dst_argb[3] = 255; } } void P410ToARGBRow_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], dst_argb + 0, dst_argb + 1, dst_argb + 2, yuvconstants); dst_argb[3] = 255; src_y += 1; src_uv += 2; dst_argb += 4; // Advance 1 pixels. } } void P210ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { int x; int b; int g; int r; for (x = 0; x < width - 1; x += 2) { YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); StoreAR30(dst_ar30, b, g, r); YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); StoreAR30(dst_ar30 + 4, b, g, r); src_y += 2; src_uv += 2; dst_ar30 += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); StoreAR30(dst_ar30, b, g, r); } } void P410ToAR30Row_C(const uint16_t* src_y, const uint16_t* src_uv, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { int x; int b; int g; int r; for (x = 0; x < width; ++x) { YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); StoreAR30(dst_ar30, b, g, r); src_y += 1; src_uv += 2; dst_ar30 += 4; // Advance 1 pixel. 
} } // 8 bit YUV to 10 bit AR30 // Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. void I422ToAR30Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; int b; int g; int r; for (x = 0; x < width - 1; x += 2) { YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; src_y += 1; src_u += 1; src_v += 1; src_a += 1; rgb_buf += 4; // Advance 1 pixel. } } void I422AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; src_v += 1; src_a += 2; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } void I422ToRGB24Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); } } void I422ToARGB4444Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { uint8_t b0; uint8_t g0; uint8_t r0; uint8_t b1; uint8_t g1; uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; *(uint16_t*)(dst_argb4444 + 0) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; *(uint16_t*)(dst_argb4444 + 2) = b1 | (g1 << 4) | (r1 << 8) | 0xf000; src_y += 2; src_u += 1; src_v += 1; dst_argb4444 += 4; // Advance 2 pixels. 
} if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; } } void I422ToARGB1555Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { uint8_t b0; uint8_t g0; uint8_t r0; uint8_t b1; uint8_t g1; uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; *(uint16_t*)(dst_argb1555 + 0) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; *(uint16_t*)(dst_argb1555 + 2) = b1 | (g1 << 5) | (r1 << 10) | 0x8000; src_y += 2; src_u += 1; src_v += 1; dst_argb1555 += 4; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; } } void I422ToRGB565Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { uint8_t b0; uint8_t g0; uint8_t r0; uint8_t b1; uint8_t g1; uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); // for ubsan *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); src_y += 2; src_u += 1; src_v += 1; dst_rgb565 += 4; // Advance 2 pixels. 
} if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } void NV12ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void NV21ToARGBRow_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void NV12ToRGB24Row_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); src_y += 2; src_uv += 2; rgb_buf += 6; // Advance 2 pixels. 
} if (width & 1) { YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); } } void NV21ToRGB24Row_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); src_y += 2; src_vu += 2; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); } } void NV12ToRGB565Row_C(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { uint8_t b0; uint8_t g0; uint8_t r0; uint8_t b1; uint8_t g1; uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; *(uint16_t*)(dst_rgb565 + 0) = b0 | (g0 << 5) | (r0 << 11); *(uint16_t*)(dst_rgb565 + 2) = b1 | (g1 << 5) | (r1 << 11); src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } void YUY2ToARGBRow_C(const uint8_t* src_yuy2, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void UYVYToARGBRow_C(const uint8_t* src_uyvy, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void I422ToRGBARow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; src_v += 1; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; rgb_buf += 8; // Advance 2 pixels. 
} if (width & 1) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; src += width - 1; for (x = 0; x < width - 1; x += 2) { dst[x] = src[0]; dst[x + 1] = src[-1]; src -= 2; } if (width & 1) { dst[width - 1] = src[0]; } } void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width; ++x) { dst_uv[0] = src_uv[0]; dst_uv[1] = src_uv[1]; src_uv -= 2; dst_uv += 2; } } void MirrorSplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; dst_u[x + 1] = src_uv[-2]; dst_v[x] = src_uv[1]; dst_v[x + 1] = src_uv[-2 + 1]; src_uv -= 4; } if (width & 1) { dst_u[width - 1] = src_uv[0]; dst_v[width - 1] = src_uv[1]; } } void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; const uint32_t* src32 = (const uint32_t*)(src); uint32_t* dst32 = (uint32_t*)(dst); src32 += width - 1; for (x = 0; x < width - 1; x += 2) { dst32[x] = src32[0]; dst32[x + 1] = src32[-1]; src32 -= 2; } if (width & 1) { dst32[width - 1] = src32[0]; } } void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { int x; src_rgb24 += width * 3 - 3; for (x = 0; x < width; ++x) { uint8_t b = src_rgb24[0]; uint8_t g = src_rgb24[1]; uint8_t r = src_rgb24[2]; dst_rgb24[0] = b; dst_rgb24[1] = g; dst_rgb24[2] = r; src_rgb24 -= 3; dst_rgb24 += 3; } } void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; dst_u[x + 1] = src_uv[2]; dst_v[x] = src_uv[1]; dst_v[x + 1] = src_uv[3]; src_uv += 4; } if (width & 1) { dst_u[width - 1] = src_uv[0]; dst_v[width - 1] = src_uv[1]; } } void MergeUVRow_C(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { int x; for (x = 0; x < width - 1; 
x += 2) { dst_uv[0] = src_u[x]; dst_uv[1] = src_v[x]; dst_uv[2] = src_u[x + 1]; dst_uv[3] = src_v[x + 1]; dst_uv += 4; } if (width & 1) { dst_uv[0] = src_u[width - 1]; dst_uv[1] = src_v[width - 1]; } } void SplitRGBRow_C(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { int x; for (x = 0; x < width; ++x) { dst_r[x] = src_rgb[0]; dst_g[x] = src_rgb[1]; dst_b[x] = src_rgb[2]; src_rgb += 3; } } void MergeRGBRow_C(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { dst_rgb[0] = src_r[x]; dst_rgb[1] = src_g[x]; dst_rgb[2] = src_b[x]; dst_rgb += 3; } } void SplitARGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) { int x; for (x = 0; x < width; ++x) { dst_b[x] = src_argb[0]; dst_g[x] = src_argb[1]; dst_r[x] = src_argb[2]; dst_a[x] = src_argb[3]; src_argb += 4; } } void MergeARGBRow_C(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { dst_argb[0] = src_b[x]; dst_argb[1] = src_g[x]; dst_argb[2] = src_r[x]; dst_argb[3] = src_a[x]; dst_argb += 4; } } void MergeXR30Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int depth, int width) { assert(depth >= 10); assert(depth <= 16); int x; int shift = depth - 10; uint32_t* dst_ar30_32 = (uint32_t*)dst_ar30; for (x = 0; x < width; ++x) { uint32_t r = clamp1023(src_r[x] >> shift); uint32_t g = clamp1023(src_g[x] >> shift); uint32_t b = clamp1023(src_b[x] >> shift); dst_ar30_32[x] = b | (g << 10) | (r << 20) | 0xc0000000; } } void MergeAR64Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width) { assert(depth >= 1); assert(depth <= 16); int x; int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { 
dst_ar64[0] = ClampMax(src_b[x], max) << shift; dst_ar64[1] = ClampMax(src_g[x], max) << shift; dst_ar64[2] = ClampMax(src_r[x], max) << shift; dst_ar64[3] = ClampMax(src_a[x], max) << shift; dst_ar64 += 4; } } void MergeARGB16To8Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width) { assert(depth >= 8); assert(depth <= 16); int x; int shift = depth - 8; for (x = 0; x < width; ++x) { dst_argb[0] = clamp255(src_b[x] >> shift); dst_argb[1] = clamp255(src_g[x] >> shift); dst_argb[2] = clamp255(src_r[x] >> shift); dst_argb[3] = clamp255(src_a[x] >> shift); dst_argb += 4; } } void MergeXR64Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width) { assert(depth >= 1); assert(depth <= 16); int x; int shift = 16 - depth; int max = (1 << depth) - 1; for (x = 0; x < width; ++x) { dst_ar64[0] = ClampMax(src_b[x], max) << shift; dst_ar64[1] = ClampMax(src_g[x], max) << shift; dst_ar64[2] = ClampMax(src_r[x], max) << shift; dst_ar64[3] = 0xffff; dst_ar64 += 4; } } void MergeXRGB16To8Row_C(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width) { assert(depth >= 8); assert(depth <= 16); int x; int shift = depth - 8; for (x = 0; x < width; ++x) { dst_argb[0] = clamp255(src_b[x] >> shift); dst_argb[1] = clamp255(src_g[x] >> shift); dst_argb[2] = clamp255(src_r[x] >> shift); dst_argb[3] = 0xff; dst_argb += 4; } } void SplitXRGBRow_C(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { int x; for (x = 0; x < width; ++x) { dst_b[x] = src_argb[0]; dst_g[x] = src_argb[1]; dst_r[x] = src_argb[2]; src_argb += 4; } } void MergeXRGBRow_C(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { dst_argb[0] = src_b[x]; dst_argb[1] = src_g[x]; dst_argb[2] = src_r[x]; dst_argb[3] = 255; 
dst_argb += 4; } } // Convert lsb formats to msb, depending on sample depth. void MergeUVRow_16_C(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width) { int shift = 16 - depth; assert(depth >= 8); assert(depth <= 16); int x; for (x = 0; x < width; ++x) { dst_uv[0] = src_u[x] << shift; dst_uv[1] = src_v[x] << shift; dst_uv += 2; } } // Convert msb formats to lsb, depending on sample depth. void SplitUVRow_16_C(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width) { int shift = 16 - depth; int x; assert(depth >= 8); assert(depth <= 16); for (x = 0; x < width; ++x) { dst_u[x] = src_uv[0] >> shift; dst_v[x] = src_uv[1] >> shift; src_uv += 2; } } void MultiplyRow_16_C(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { int x; for (x = 0; x < width; ++x) { dst_y[x] = src_y[x] * scale; } } void DivideRow_16_C(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { int x; for (x = 0; x < width; ++x) { dst_y[x] = (src_y[x] * scale) >> 16; } } // Use scale to convert lsb formats to msb, depending how many bits there are: // 32768 = 9 bits // 16384 = 10 bits // 4096 = 12 bits // 256 = 16 bits void Convert16To8Row_C(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { int x; assert(scale >= 256); assert(scale <= 32768); for (x = 0; x < width; ++x) { dst_y[x] = clamp255((src_y[x] * scale) >> 16); } } // Use scale to convert lsb formats to msb, depending how many bits there are: // 1024 = 10 bits void Convert8To16Row_C(const uint8_t* src_y, uint16_t* dst_y, int scale, int width) { int x; scale *= 0x0101; // replicates the byte. 
for (x = 0; x < width; ++x) { dst_y[x] = (src_y[x] * scale) >> 16; } } void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { memcpy(dst, src, count * 2); } void SetRow_C(uint8_t* dst, uint8_t v8, int width) { memset(dst, v8, width); } void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { int x; for (x = 0; x < width; ++x) { memcpy(dst_argb + x * sizeof v32, &v32, sizeof v32); } } // Filter 2 rows of YUY2 UV's (422) into U and V (420). void YUY2ToUVRow_C(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { // Output a row of UV values, filtering 2 rows of YUY2. int x; for (x = 0; x < width; x += 2) { dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; src_yuy2 += 4; dst_u += 1; dst_v += 1; } } // Copy row of YUY2 UV's (422) into U and V (422). void YUY2ToUV422Row_C(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { dst_u[0] = src_yuy2[1]; dst_v[0] = src_yuy2[3]; src_yuy2 += 4; dst_u += 1; dst_v += 1; } } // Copy row of YUY2 Y's (422) into Y (420/422). void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { dst_y[x] = src_yuy2[0]; dst_y[x + 1] = src_yuy2[2]; src_yuy2 += 4; } if (width & 1) { dst_y[width - 1] = src_yuy2[0]; } } // Filter 2 rows of UYVY UV's (422) into U and V (420). void UYVYToUVRow_C(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { // Output a row of UV values. 
int x; for (x = 0; x < width; x += 2) { dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; src_uyvy += 4; dst_u += 1; dst_v += 1; } } // Copy row of UYVY UV's (422) into U and V (422). void UYVYToUV422Row_C(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { dst_u[0] = src_uyvy[0]; dst_v[0] = src_uyvy[2]; src_uyvy += 4; dst_u += 1; dst_v += 1; } } // Copy row of UYVY Y's (422) into Y (420/422). void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { dst_y[x] = src_uyvy[1]; dst_y[x + 1] = src_uyvy[3]; src_uyvy += 4; } if (width & 1) { dst_y[width - 1] = src_uyvy[1]; } } #define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) // Blend src_argb over src_argb1 and store to dst_argb. // dst_argb may be src_argb or src_argb1. // This code mimics the SSSE3 version for better testability. 
// For each pixel the B, G and R channels are computed with BLEND (defined just
// above) from the foreground channel, the background channel and the
// foreground alpha; the destination alpha byte is always written as 255.
//
//   src_argb   foreground row, 4 bytes per pixel (B, G, R, A byte order).
//   src_argb1  background row, 4 bytes per pixel; its alpha byte is not read.
//   dst_argb   output row, 4 bytes per pixel.
//   width      number of pixels to process.
void ARGBBlendRow_C(const uint8_t* src_argb,
                    const uint8_t* src_argb1,
                    uint8_t* dst_argb,
                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    // Read every input of this pixel before storing anything, because
    // dst_argb is allowed to be the same buffer as either source.
    const uint32_t fb = src_argb[0];
    const uint32_t fg = src_argb[1];
    const uint32_t fr = src_argb[2];
    const uint32_t a = src_argb[3];
    const uint32_t bb = src_argb1[0];
    const uint32_t bg = src_argb1[1];
    const uint32_t br = src_argb1[2];
    dst_argb[0] = BLEND(fb, bb, a);
    dst_argb[1] = BLEND(fg, bg, a);
    dst_argb[2] = BLEND(fr, br, a);
    dst_argb[3] = 255u;
    src_argb += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
#undef BLEND

// Weighted mix of two samples: (a * f + (255 - a) * b + 255) >> 8.
// The expansion and every argument are parenthesized so the macro is safe
// inside larger expressions and with non-trivial arguments.
#define UBLEND(f, b, a) ((((a) * (f)) + ((255 - (a)) * (b)) + 255) >> 8)

// Blends two single-channel rows using a per-sample alpha row.
//
//   src0   row weighted by alpha.
//   src1   row weighted by (255 - alpha).
//   alpha  one weight byte per sample.
//   dst    output row, one byte per sample.
//   width  number of samples to process.
void BlendPlaneRow_C(const uint8_t* src0,
                     const uint8_t* src1,
                     const uint8_t* alpha,
                     uint8_t* dst,
                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = UBLEND(src0[x], src1[x], alpha[x]);
  }
}
#undef UBLEND

// ATTENUATE(f, a) scales color value f by alpha a. Both variants are fully
// parenthesized; the arithmetic is unchanged.
#if LIBYUV_ATTENUATE_DUP
// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) ((((a) | ((a) << 8)) * ((f) | ((f) << 8))) >> 24)
#else
#define ATTENUATE(f, a) (((f) * (a) + 128) >> 8)
#endif

// Multiply source RGB by alpha and store to destination.
// Each pixel is 4 bytes in B, G, R, A order. The three color bytes are scaled
// with ATTENUATE (selected by LIBYUV_ATTENUATE_DUP above) and the alpha byte is
// copied through unchanged.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  int i;
  // Main loop consumes two pixels (8 bytes) per iteration.
  for (i = 0; i < width - 1; i += 2) {
    int offset;
    for (offset = 0; offset < 8; offset += 4) {
      // Load the full pixel before writing it so that an in-place call
      // (dst_argb == src_argb) sees unmodified inputs.
      const uint32_t blue = src_argb[offset + 0];
      const uint32_t green = src_argb[offset + 1];
      const uint32_t red = src_argb[offset + 2];
      const uint32_t alpha = src_argb[offset + 3];
      dst_argb[offset + 0] = ATTENUATE(blue, alpha);
      dst_argb[offset + 1] = ATTENUATE(green, alpha);
      dst_argb[offset + 2] = ATTENUATE(red, alpha);
      dst_argb[offset + 3] = alpha;
    }
    src_argb += 8;
    dst_argb += 8;
  }

  // Odd width: one pixel is left over.
  if (width & 1) {
    const uint32_t blue = src_argb[0];
    const uint32_t green = src_argb[1];
    const uint32_t red = src_argb[2];
    const uint32_t alpha = src_argb[3];
    dst_argb[0] = ATTENUATE(blue, alpha);
    dst_argb[1] = ATTENUATE(green, alpha);
    dst_argb[2] = ATTENUATE(red, alpha);
    dst_argb[3] = alpha;
  }
}
#undef ATTENUATE

// Divide source RGB by alpha and store to destination.
// b = (b * 255 + (a / 2)) / a;
// g = (g * 255 + (a / 2)) / a;
// r = (r * 255 + (a / 2)) / a;
// Reciprocal method is off by 1 on some values. ie 125
// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
#define T(a) 0x01000000 + (0x10000 / a) const uint32_t fixed_invtbl8[256] = { 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), T(0xd2), T(0xd3), 
T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T #if LIBYUV_UNATTENUATE_DUP // This code mimics the Intel SIMD version for better testability. #define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16) #else #define UNATTENUATE(f, ia) clamp255((f * ia) >> 8) #endif // mimics the Intel SIMD code for exactness. void ARGBUnattenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width; ++i) { uint32_t b = src_argb[0]; uint32_t g = src_argb[1]; uint32_t r = src_argb[2]; const uint32_t a = src_argb[3]; const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point // Clamping should not be necessary but is free in assembly. 
dst_argb[0] = UNATTENUATE(b, ia); dst_argb[1] = UNATTENUATE(g, ia); dst_argb[2] = UNATTENUATE(r, ia); dst_argb[3] = a; src_argb += 4; dst_argb += 4; } } void ComputeCumulativeSumRow_C(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { int32_t row_sum[4] = {0, 0, 0, 0}; int x; for (x = 0; x < width; ++x) { row_sum[0] += row[x * 4 + 0]; row_sum[1] += row[x * 4 + 1]; row_sum[2] += row[x * 4 + 2]; row_sum[3] += row[x * 4 + 3]; cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; } } void CumulativeSumToAverageRow_C(const int32_t* tl, const int32_t* bl, int w, int area, uint8_t* dst, int count) { float ooa = 1.0f / area; int i; for (i = 0; i < count; ++i) { dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); dst += 4; tl += 4; bl += 4; } } // Copy pixels from rotated source to destination row with a slope. LIBYUV_API void ARGBAffineRow_C(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* uv_dudv, int width) { int i; // Render a row of pixels from source into a buffer. float uv[2]; uv[0] = uv_dudv[0]; uv[1] = uv_dudv[1]; for (i = 0; i < width; ++i) { int x = (int)(uv[0]); int y = (int)(uv[1]); *(uint32_t*)(dst_argb) = *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); dst_argb += 4; uv[0] += uv_dudv[2]; uv[1] += uv_dudv[3]; } } // Blend 2 rows into 1. 
// Rounded average of two rows of bytes:
//   dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1
// src_uv_stride is the offset from the first row to the second, in bytes.
static void HalfRow_C(const uint8_t* src_uv,
                      ptrdiff_t src_uv_stride,
                      uint8_t* dst_uv,
                      int width) {
  const uint8_t* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x] = (src_uv[x] + src_uv1[x] + 1) >> 1;
  }
}

// 16-bit variant of HalfRow_C. src_uv_stride is counted in uint16_t elements
// because it is applied to a uint16_t pointer.
static void HalfRow_16_C(const uint16_t* src_uv,
                         ptrdiff_t src_uv_stride,
                         uint16_t* dst_uv,
                         int width) {
  const uint16_t* src_uv1 = src_uv + src_uv_stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x] = (src_uv[x] + src_uv1[x] + 1) >> 1;
  }
}

// C version 2x2 -> 2x1.
//
// Vertically interpolates between the row at src_ptr and the row at
// src_ptr + src_stride.
//
//   dst_ptr            output row, width bytes.
//   src_ptr            first source row.
//   src_stride         byte offset from the first source row to the second.
//   width              number of bytes to produce.
//   source_y_fraction  weight of the second row in 1/256 units; the first row
//                      gets 256 - source_y_fraction.
//
// A fraction of 0 copies the first row and a fraction of 128 takes the rounded
// average of both rows; every other fraction uses
//   (row0 * (256 - f) + row1 * f + 128) >> 8.
void InterpolateRow_C(uint8_t* dst_ptr,
                      const uint8_t* src_ptr,
                      ptrdiff_t src_stride,
                      int width,
                      int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  int x;
  if (y1_fraction == 0) {
    memcpy(dst_ptr, src_ptr, width);
    return;
  }
  if (y1_fraction == 128) {
    HalfRow_C(src_ptr, src_stride, dst_ptr, width);
    return;
  }
  for (x = 0; x < width; ++x) {
    dst_ptr[x] =
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8;
  }
}

// 16-bit variant of InterpolateRow_C. src_stride and width are counted in
// uint16_t elements.
//
// The weighted sum is rounded with + 128 before the shift, the same as the
// 8-bit version and consistent with the rounded average used for a fraction of
// 128. The largest intermediate value is 65535 * 256 + 128, which fits in int.
void InterpolateRow_16_C(uint16_t* dst_ptr,
                         const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         int width,
                         int source_y_fraction) {
  const int y1_fraction = source_y_fraction;
  const int y0_fraction = 256 - y1_fraction;
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  int x;
  if (y1_fraction == 0) {
    memcpy(dst_ptr, src_ptr, width * 2);  // 2 bytes per sample.
    return;
  }
  if (y1_fraction == 128) {
    HalfRow_16_C(src_ptr, src_stride, dst_ptr, width);
    return;
  }
  for (x = 0; x < width; ++x) {
    dst_ptr[x] =
        (src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction + 128) >> 8;
  }
}

// Use first 4 shuffler values to reorder ARGB channels.
void ARGBShuffleRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; int index2 = shuffler[2]; int index3 = shuffler[3]; // Shuffle a row of ARGB. int x; for (x = 0; x < width; ++x) { // To support in-place conversion. uint8_t b = src_argb[index0]; uint8_t g = src_argb[index1]; uint8_t r = src_argb[index2]; uint8_t a = src_argb[index3]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; dst_argb[3] = a; src_argb += 4; dst_argb += 4; } } void I422ToYUY2Row_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_frame, int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_y[0]; dst_frame[1] = src_u[0]; dst_frame[2] = src_y[1]; dst_frame[3] = src_v[0]; dst_frame += 4; src_y += 2; src_u += 1; src_v += 1; } if (width & 1) { dst_frame[0] = src_y[0]; dst_frame[1] = src_u[0]; dst_frame[2] = 0; dst_frame[3] = src_v[0]; } } void I422ToUYVYRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_frame, int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_u[0]; dst_frame[1] = src_y[0]; dst_frame[2] = src_v[0]; dst_frame[3] = src_y[1]; dst_frame += 4; src_y += 2; src_u += 1; src_v += 1; } if (width & 1) { dst_frame[0] = src_u[0]; dst_frame[1] = src_y[0]; dst_frame[2] = src_v[0]; dst_frame[3] = 0; } } void ARGBPolynomialRow_C(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { int i; for (i = 0; i < width; ++i) { float b = (float)(src_argb[0]); float g = (float)(src_argb[1]); float r = (float)(src_argb[2]); float a = (float)(src_argb[3]); float b2 = b * b; float g2 = g * g; float r2 = r * r; float a2 = a * a; float db = poly[0] + poly[4] * b; float dg = poly[1] + poly[5] * g; float dr = poly[2] + poly[6] * r; float da = poly[3] + poly[7] * a; float b3 = b2 * b; float g3 = g2 * g; float r3 = r2 * r; float a3 = a2 * a; db += poly[8] * b2; dg += poly[9] * g2; dr += poly[10] * 
r2; da += poly[11] * a2; db += poly[12] * b3; dg += poly[13] * g3; dr += poly[14] * r3; da += poly[15] * a3; dst_argb[0] = Clamp((int32_t)(db)); dst_argb[1] = Clamp((int32_t)(dg)); dst_argb[2] = Clamp((int32_t)(dr)); dst_argb[3] = Clamp((int32_t)(da)); src_argb += 4; dst_argb += 4; } } // Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor // adjust the source integer range to the half float range desired. // This magic constant is 2^-112. Multiplying by this // is the same as subtracting 112 from the exponent, which // is the difference in exponent bias between 32-bit and // 16-bit floats. Once we've done this subtraction, we can // simply extract the low bits of the exponent and the high // bits of the mantissa from our float and we're done. // Work around GCC 7 punning warning -Wstrict-aliasing #if defined(__GNUC__) typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; #else typedef uint32_t uint32_alias_t; #endif void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width) { int i; float mult = 1.9259299444e-34f * scale; for (i = 0; i < width; ++i) { float value = src[i] * mult; dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); } } void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { int i; for (i = 0; i < width; ++i) { float value = src[i] * scale; dst[i] = value; } } void ARGBLumaColorTableRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width, const uint8_t* luma, uint32_t lumacoeff) { uint32_t bc = lumacoeff & 0xff; uint32_t gc = (lumacoeff >> 8) & 0xff; uint32_t rc = (lumacoeff >> 16) & 0xff; int i; for (i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. 
const uint8_t* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + luma; const uint8_t* luma1; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; luma1 = ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + luma; dst_argb[4] = luma1[src_argb[4]]; dst_argb[5] = luma1[src_argb[5]]; dst_argb[6] = luma1[src_argb[6]]; dst_argb[7] = src_argb[7]; src_argb += 8; dst_argb += 8; } if (width & 1) { // Luminance in rows, color values in columns. const uint8_t* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; } } void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[3]; dst[7] = src[7]; dst += 8; src += 8; } if (width & 1) { dst[3] = src[3]; } } void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst_a[0] = src_argb[3]; dst_a[1] = src_argb[7]; dst_a += 2; src_argb += 8; } if (width & 1) { dst_a[0] = src_argb[3]; } } void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[0]; dst[7] = src[1]; dst += 8; src += 2; } if (width & 1) { dst[3] = src[0]; } } // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 #if !(defined(_MSC_VER) && !defined(__clang__) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_rgb565 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_I422TOARGB1555ROW_SSSE3) void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_argb1555 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_I422TOARGB4444ROW_SSSE3) void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_argb4444 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_NV12TORGB565ROW_SSSE3) void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); src_y += twidth; src_uv += twidth; dst_rgb565 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_NV12TORGB24ROW_SSSE3) void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; src_uv += twidth; dst_rgb24 += twidth * 3; width -= twidth; } } #endif #if defined(HAS_NV21TORGB24ROW_SSSE3) void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); src_y += twidth; src_vu += twidth; dst_rgb24 += twidth * 3; width -= twidth; } } #endif #if defined(HAS_NV12TORGB24ROW_AVX2) void NV12ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_uv += twidth; dst_rgb24 += twidth * 3; width -= twidth; } } #endif #if defined(HAS_NV21TORGB24ROW_AVX2) void NV21ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_vu += twidth; dst_rgb24 += twidth * 3; width -= twidth; } } #endif #if defined(HAS_I422TORGB565ROW_AVX2) void I422ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); #if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); #else ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_rgb565 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_I422TOARGB1555ROW_AVX2) void I422ToARGB1555Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); #if defined(HAS_ARGBTOARGB1555ROW_AVX2) ARGBToARGB1555Row_AVX2(row, dst_argb1555, twidth); #else ARGBToARGB1555Row_SSE2(row, dst_argb1555, twidth); #endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_argb1555 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_I422TOARGB4444ROW_AVX2) void I422ToARGB4444Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); #if defined(HAS_ARGBTOARGB4444ROW_AVX2) ARGBToARGB4444Row_AVX2(row, dst_argb4444, twidth); #else ARGBToARGB4444Row_SSE2(row, dst_argb4444, twidth); #endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_argb4444 += twidth * 2; width -= twidth; } } #endif #if defined(HAS_I422TORGB24ROW_AVX2) void I422ToRGB24Row_AVX2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); #if defined(HAS_ARGBTORGB24ROW_AVX2) ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); #else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); #endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; dst_rgb24 += twidth * 3; width -= twidth; } } #endif #if defined(HAS_NV12TORGB565ROW_AVX2) void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. 
SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); #if defined(HAS_ARGBTORGB565ROW_AVX2) ARGBToRGB565Row_AVX2(row, dst_rgb565, twidth); #else ARGBToRGB565Row_SSE2(row, dst_rgb565, twidth); #endif src_y += twidth; src_uv += twidth; dst_rgb565 += twidth * 2; width -= twidth; } } #endif #ifdef HAS_RGB24TOYJROW_AVX2 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); ARGBToYJRow_AVX2(row, dst_yj, twidth); src_rgb24 += twidth * 3; dst_yj += twidth; width -= twidth; } } #endif // HAS_RGB24TOYJROW_AVX2 #ifdef HAS_RAWTOYJROW_AVX2 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RAWToARGBRow_SSSE3(src_raw, row, twidth); ARGBToYJRow_AVX2(row, dst_yj, twidth); src_raw += twidth * 3; dst_yj += twidth; width -= twidth; } } #endif // HAS_RAWTOYJROW_AVX2 #ifdef HAS_RGB24TOYJROW_SSSE3 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RGB24ToARGBRow_SSSE3(src_rgb24, row, twidth); ARGBToYJRow_SSSE3(row, dst_yj, twidth); src_rgb24 += twidth * 3; dst_yj += twidth; width -= twidth; } } #endif // HAS_RGB24TOYJROW_SSSE3 #ifdef HAS_RAWTOYJROW_SSSE3 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. 
void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { // Row buffer for intermediate ARGB pixels. SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RAWToARGBRow_SSSE3(src_raw, row, twidth); ARGBToYJRow_SSSE3(row, dst_yj, twidth); src_raw += twidth * 3; dst_yj += twidth; width -= twidth; } } #endif // HAS_RAWTOYJROW_SSSE3 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float fsum = 0.f; int i; for (i = 0; i < width; ++i) { float v = *src++; fsum += v * v; *dst++ = v * scale; } return fsum; } float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { float fmax = 0.f; int i; for (i = 0; i < width; ++i) { float v = *src++; float vs = v * scale; fmax = (v > fmax) ? v : fmax; *dst++ = vs; } return fmax; } void ScaleSamples_C(const float* src, float* dst, float scale, int width) { int i; for (i = 0; i < width; ++i) { *dst++ = *src++ * scale; } } void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { int i; for (i = 0; i < width; ++i) { *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; ++src; } } // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. void GaussCol_C(const uint16_t* src0, const uint16_t* src1, const uint16_t* src2, const uint16_t* src3, const uint16_t* src4, uint32_t* dst, int width) { int i; for (i = 0; i < width; ++i) { *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; } } void GaussRow_F32_C(const float* src, float* dst, int width) { int i; for (i = 0; i < width; ++i) { *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * (1.0f / 256.0f); ++src; } } // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
void GaussCol_F32_C(const float* src0, const float* src1, const float* src2, const float* src3, const float* src4, float* dst, int width) { int i; for (i = 0; i < width; ++i) { *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; } } // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_C(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_yuv24[0] = src_vu[0]; // V dst_yuv24[1] = src_vu[1]; // U dst_yuv24[2] = src_y[0]; // Y0 dst_yuv24[3] = src_vu[0]; // V dst_yuv24[4] = src_vu[1]; // U dst_yuv24[5] = src_y[1]; // Y1 src_y += 2; src_vu += 2; dst_yuv24 += 6; // Advance 2 pixels. } if (width & 1) { dst_yuv24[0] = src_vu[0]; // V dst_yuv24[1] = src_vu[1]; // U dst_yuv24[2] = src_y[0]; // Y0 } } // Filter 2 rows of AYUV UV's (444) into UV (420). // AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { // Output a row of UV values, filtering 2x2 rows of AYUV. int x; for (x = 0; x < width - 1; x += 2) { dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; src_ayuv += 8; dst_uv += 2; } if (width & 1) { dst_uv[0] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; dst_uv[1] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; } } // Filter 2 rows of AYUV UV's (444) into VU (420). void AYUVToVURow_C(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { // Output a row of VU values, filtering 2x2 rows of AYUV. 
int x; for (x = 0; x < width - 1; x += 2) { dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2; dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2; src_ayuv += 8; dst_vu += 2; } if (width & 1) { dst_vu[0] = (src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + 1) >> 1; dst_vu[1] = (src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + 1) >> 1; } } // Copy row of AYUV Y's into Y void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width; ++x) { dst_y[x] = src_ayuv[2]; // v,u,y,a src_ayuv += 4; } } // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { int x; for (x = 0; x < width; ++x) { uint8_t u = src_uv[0]; uint8_t v = src_uv[1]; dst_vu[0] = v; dst_vu[1] = u; src_uv += 2; dst_vu += 2; } } void HalfMergeUVRow_C(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_uv[0] = (src_u[0] + src_u[1] + src_u[src_stride_u] + src_u[src_stride_u + 1] + 2) >> 2; dst_uv[1] = (src_v[0] + src_v[1] + src_v[src_stride_v] + src_v[src_stride_v + 1] + 2) >> 2; src_u += 2; src_v += 2; dst_uv += 2; } if (width & 1) { dst_uv[0] = (src_u[0] + src_u[src_stride_u] + 1) >> 1; dst_uv[1] = (src_v[0] + src_v[src_stride_v] + 1) >> 1; } } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/row_gcc.cc000066400000000000000000014167571416500237200210050ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. 
All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Every coefficient table below is 16 bytes: one 4-entry group repeated four
// times. NOTE(review): presumably one weight per byte of a 4-byte pixel in
// memory order, with 0 for the alpha byte - confirm at the use sites.
static const uvec8 kARGBToY = {25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u,
                               25u, 129u, 66u, 0u, 25u, 129u, 66u, 0u};

// JPeg full range.
static const uvec8 kARGBToYJ = {29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u,
                                29u, 150u, 77u, 0u, 29u, 150u, 77u, 0u};

// Same weights as kARGBToYJ with the zero entry first instead of last.
static const uvec8 kRGBAToYJ = {0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u,
                                0u, 29u, 150u, 77u, 0u, 29u, 150u, 77u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Signed (vec8) weights for the U and V outputs; the J variants use a
// different set of magnitudes than the non-J variants.
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
                              -18, -94, 112, 0, -18, -94, 112, 0};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
// Same weight values as the ARGB tables, listed in reverse order within each
// 4-entry group (zero entry first).
static const uvec8 kBGRAToY = {0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u,
                               0u, 66u, 129u, 25u, 0u, 66u, 129u, 25u};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
// Same weight values as the ARGB tables with the first and third entries of
// each group exchanged (zero entry last).
static const uvec8 kABGRToY = {66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u,
                               66u, 129u, 25u, 0u, 66u, 129u, 25u, 0u};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
// Same weight values as the ARGB tables shifted by one position (zero entry
// first).
static const uvec8 kRGBAToY = {0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u,
                               0u, 25u, 129u, 66u, 0u, 25u, 129u, 66u};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

// The three vectors below hold one value replicated in every lane.
// NOTE(review): their names suggest bias terms added to or subtracted from
// the Y / UV results - confirm at the use sites.
static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
                               0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u};

static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// 0x8080 is the byte value 128 in both halves of each 16 bit lane.
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                               0x8080u, 0x8080u, 0x8080u, 0x8080u};

#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
// pshufb control used by RGB24ToARGBRow_SSSE3. Each 4-byte output pixel takes
// three consecutive source bytes; the fourth index (12..15) is only a
// placeholder because that routine ORs 0xff000000 into every pixel afterwards.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
// Same layout as kShuffleMaskRGB24ToARGB but the three source bytes of each
// pixel are taken in reverse order. Used by RAWToARGBRow_SSSE3.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGBA.
// As kShuffleMaskRAWToARGB but the placeholder index comes first in each
// pixel; RAWToRGBARow_SSSE3 ORs 0x000000ff into every pixel afterwards.
static const uvec8 kShuffleMaskRAWToRGBA = {12u, 2u, 1u, 0u, 13u, 5u, 4u, 3u,
                                            14u, 8u, 7u, 6u, 15u, 11u, 10u, 9u};

// Shuffle table for converting RAW to RGB24.  First 8.
// The three RAWToRGB24 tables are applied by RAWToRGB24Row_SSSE3 to 16-byte
// loads taken at source offsets 0, 4 and 8; only the low 8 bytes of each
// shuffled register are stored. The upper 8 indices are 128, for which
// pshufb writes zero.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
// pshufb control used by ARGBToRGB24Row_SSSE3 and ARGBToRGB24Row_AVX2: keeps
// bytes 0-2 of each 4-byte pixel and drops byte 3, packing 4 pixels into the
// low 12 bytes. The last four indices are 128, for which pshufb writes zero,
// so the shifted registers can be merged with por.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
// Same packing as kShuffleMaskARGBToRGB24 with the three kept bytes of each
// pixel in reverse order. Used by ARGBToRAWRow_SSSE3 and ARGBToRAWRow_AVX2.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
// Unlike kShuffleMaskARGBToRGB24 the four zeroing indices (128) sit in lanes
// 8-11 and the last four packed bytes in lanes 12-15.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// The lvec8 tables below are 32 bytes: the same 16-entry pattern twice.

// YUY2 shuf 16 Y to 32 Y.
// Selects the even bytes of each 16-byte half, each one twice.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4,  4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
// Selects the odd bytes of each 16-byte half in pairs, each pair twice.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
// Selects the odd bytes of each 16-byte half, each one twice.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5,  5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
// Selects the even bytes of each 16-byte half in pairs, each pair twice.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = { 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_J400TOARGBROW_SSE2 void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "lea 0x8(%0),%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm1 \n" "por %%xmm5,%%xmm0 \n" "por %%xmm5,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm3 \n" "lea 0x30(%0),%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" "movdqu %%xmm2,0x20(%1) \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" "movdqu %%xmm1,0x10(%1) \n" "por %%xmm5,%%xmm3 \n" "movdqu %%xmm3,0x30(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kShuffleMaskRGB24ToARGB) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 "pslld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" "movdqu 
(%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm3 \n" "lea 0x30(%0),%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" "movdqu %%xmm2,0x20(%1) \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" "movdqu %%xmm1,0x10(%1) \n" "por %%xmm5,%%xmm3 \n" "movdqu %%xmm3,0x30(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kShuffleMaskRAWToARGB) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } // Same code as RAWToARGB with different shuffler and A in low bits void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "psrld $0x18,%%xmm5 \n" "movdqa %3,%%xmm4 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm3 \n" "lea 0x30(%0),%0 \n" "movdqa %%xmm3,%%xmm2 \n" "palignr $0x8,%%xmm1,%%xmm2 \n" "pshufb %%xmm4,%%xmm2 \n" "por %%xmm5,%%xmm2 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" "movdqu %%xmm2,0x20(%1) \n" "por %%xmm5,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "palignr $0x4,%%xmm3,%%xmm3 \n" "pshufb %%xmm4,%%xmm3 \n" "movdqu %%xmm1,0x10(%1) \n" "por %%xmm5,%%xmm3 \n" "movdqu %%xmm3,0x30(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 : "m"(kShuffleMaskRAWToRGBA) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" "movdqa %5,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x4(%0),%%xmm1 \n" "movdqu 
0x8(%0),%%xmm2 \n" "lea 0x18(%0),%0 \n" "pshufb %%xmm3,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" "pshufb %%xmm5,%%xmm2 \n" "movq %%xmm0,(%1) \n" "movq %%xmm1,0x8(%1) \n" "movq %%xmm2,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : "m"(kShuffleMaskRAWToRGB24_0), // %3 "m"(kShuffleMaskRAWToRGB24_1), // %4 "m"(kShuffleMaskRAWToRGB24_2) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x20802080,%%eax \n" "movd %%eax,%%xmm6 \n" "pshufd $0x0,%%xmm6,%%xmm6 \n" "pcmpeqb %%xmm3,%%xmm3 \n" "psllw $0xb,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xa,%%xmm4 \n" "psrlw $0x5,%%xmm4 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "psllw $0x8,%%xmm7 \n" "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm3,%%xmm1 \n" "psllw $0xb,%%xmm2 \n" "pmulhuw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm5,%%xmm2 \n" "psllw $0x8,%%xmm1 \n" "por %%xmm2,%%xmm1 \n" "pand %%xmm4,%%xmm0 \n" "pmulhuw %%xmm6,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" "movdqu %%xmm1,0x00(%1,%0,2) \n" "movdqu %%xmm2,0x10(%1,%0,2) \n" "lea 0x10(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "mov $0x1080108,%%eax \n" "movd %%eax,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x42004200,%%eax \n" "movd %%eax,%%xmm6 \n" "pshufd $0x0,%%xmm6,%%xmm6 \n" "pcmpeqb %%xmm3,%%xmm3 \n" "psllw $0xb,%%xmm3 \n" "movdqa %%xmm3,%%xmm4 \n" "psrlw $0x6,%%xmm4 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "psllw $0x8,%%xmm7 \n" 
"sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "psllw $0x1,%%xmm1 \n" "psllw $0xb,%%xmm2 \n" "pand %%xmm3,%%xmm1 \n" "pmulhuw %%xmm5,%%xmm2 \n" "pmulhuw %%xmm5,%%xmm1 \n" "psllw $0x8,%%xmm1 \n" "por %%xmm2,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm4,%%xmm0 \n" "psraw $0x8,%%xmm2 \n" "pmulhuw %%xmm6,%%xmm0 \n" "pand %%xmm7,%%xmm2 \n" "por %%xmm2,%%xmm0 \n" "movdqa %%xmm1,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm2 \n" "movdqu %%xmm1,0x00(%1,%0,2) \n" "movdqu %%xmm2,0x10(%1,%0,2) \n" "lea 0x10(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "mov $0xf0f0f0f,%%eax \n" "movd %%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "pslld $0x4,%%xmm5 \n" "sub %0,%1 \n" "sub %0,%1 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "pand %%xmm4,%%xmm0 \n" "pand %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm3 \n" "psllw $0x4,%%xmm1 \n" "psrlw $0x4,%%xmm3 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm3,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" "movdqu %%xmm0,0x00(%1,%0,2) \n" "movdqu %%xmm1,0x10(%1,%0,2) \n" "lea 0x10(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x30(%0),%%xmm3 \n" "lea 0x40(%0),%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "pshufb %%xmm6,%%xmm2 \n" "pshufb %%xmm6,%%xmm3 \n" "movdqa 
%%xmm1,%%xmm4 \n" "psrldq $0x4,%%xmm1 \n" "pslldq $0xc,%%xmm4 \n" "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" "movdqu %%xmm1,0x10(%1) \n" "movdqu %%xmm2,0x20(%1) \n" "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleMaskARGBToRGB24) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "movdqa %3,%%xmm6 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x30(%0),%%xmm3 \n" "lea 0x40(%0),%0 \n" "pshufb %%xmm6,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "pshufb %%xmm6,%%xmm2 \n" "pshufb %%xmm6,%%xmm3 \n" "movdqa %%xmm1,%%xmm4 \n" "psrldq $0x4,%%xmm1 \n" "pslldq $0xc,%%xmm4 \n" "movdqa %%xmm2,%%xmm5 \n" "por %%xmm4,%%xmm0 \n" "pslldq $0x8,%%xmm5 \n" "movdqu %%xmm0,(%1) \n" "por %%xmm5,%%xmm1 \n" "psrldq $0x8,%%xmm2 \n" "pslldq $0x4,%%xmm3 \n" "por %%xmm3,%%xmm2 \n" "movdqu %%xmm1,0x10(%1) \n" "movdqu %%xmm2,0x20(%1) \n" "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleMaskARGBToRAW) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #ifdef HAS_ARGBTORGB24ROW_AVX2 // vpermd for 12+12 to 24 static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" "vmovdqu 0x60(%0),%%ymm3 \n" "lea 0x80(%0),%0 \n" "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" "vpermd 
%%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes "vpermd %%ymm1,%%ymm7,%%ymm1 \n" "vpermd %%ymm2,%%ymm7,%%ymm2 \n" "vpermd %%ymm3,%%ymm7,%%ymm3 \n" "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 "vpor %%ymm4,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 "vpermq $0x4f,%%ymm2,%%ymm4 \n" "vpor %%ymm4,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm1,0x20(%1) \n" "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 "vpermq $0x93,%%ymm3,%%ymm3 \n" "vpor %%ymm3,%%ymm2,%%ymm2 \n" "vmovdqu %%ymm2,0x40(%1) \n" "lea 0x60(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleMaskARGBToRGB24), // %3 "m"(kPermdRGB24_AVX) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_ARGBTORGB24ROW_AVX512VBMI // Shuffle table for converting ARGBToRGB24 static const ulvec8 kPermARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; static const ulvec8 kPermARGBToRGB24_1 = { 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; static const ulvec8 kPermARGBToRGB24_2 = { 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vmovdqa %3,%%ymm5 \n" "vmovdqa %4,%%ymm6 \n" "vmovdqa %5,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" "vmovdqu 0x60(%0),%%ymm3 \n" "lea 0x80(%0),%0 \n" "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "vmovdqu %%ymm2,0x40(%1) \n" 
"lea 0x60(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kPermARGBToRGB24_0), // %3 "m"(kPermARGBToRGB24_1), // %4 "m"(kPermARGBToRGB24_2) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_ARGBTORAWROW_AVX2 void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm6 \n" "vmovdqa %4,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" "vmovdqu 0x60(%0),%%ymm3 \n" "lea 0x80(%0),%0 \n" "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes "vpermd %%ymm1,%%ymm7,%%ymm1 \n" "vpermd %%ymm2,%%ymm7,%%ymm2 \n" "vpermd %%ymm3,%%ymm7,%%ymm3 \n" "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 "vpor %%ymm4,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 "vpermq $0x4f,%%ymm2,%%ymm4 \n" "vpor %%ymm4,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm1,0x20(%1) \n" "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 "vpermq $0x93,%%ymm3,%%ymm3 \n" "vpor %%ymm3,%%ymm2,%%ymm2 \n" "vmovdqu %%ymm2,0x40(%1) \n" "lea 0x60(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleMaskARGBToRAW), // %3 "m"(kPermdRGB24_AVX) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1a,%%xmm4 \n" "pslld $0x5,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" "psrld $0x3,%%xmm1 \n" "psrld 
$0x5,%%xmm2 \n" "psrad $0x10,%%xmm0 \n" "pand %%xmm3,%%xmm1 \n" "pand %%xmm4,%%xmm2 \n" "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "lea 0x10(%0),%0 \n" "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, uint8_t* dst, const uint32_t dither4, int width) { asm volatile( "movd %3,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm6 \n" "movdqa %%xmm6,%%xmm7 \n" "punpcklwd %%xmm6,%%xmm6 \n" "punpckhwd %%xmm7,%%xmm7 \n" "pcmpeqb %%xmm3,%%xmm3 \n" "psrld $0x1b,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1a,%%xmm4 \n" "pslld $0x5,%%xmm4 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0xb,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "paddusb %%xmm6,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "pslld $0x8,%%xmm0 \n" "psrld $0x3,%%xmm1 \n" "psrld $0x5,%%xmm2 \n" "psrad $0x10,%%xmm0 \n" "pand %%xmm3,%%xmm1 \n" "pand %%xmm4,%%xmm2 \n" "pand %%xmm5,%%xmm0 \n" "por %%xmm2,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "lea 0x10(%0),%0 \n" "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(dither4) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, uint8_t* dst, const uint32_t dither4, int width) { asm volatile( "vbroadcastss %3,%%xmm6 \n" "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" "vpermq $0xd8,%%ymm6,%%ymm6 \n" "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" "vpsrld $0x1b,%%ymm3,%%ymm3 \n" "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $0x1a,%%ymm4,%%ymm4 \n" "vpslld $0x5,%%ymm4,%%ymm4 \n" "vpslld $0xb,%%ymm3,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpaddusb %%ymm6,%%ymm0,%%ymm0 
\n" "vpsrld $0x5,%%ymm0,%%ymm2 \n" "vpsrld $0x3,%%ymm0,%%ymm1 \n" "vpsrld $0x8,%%ymm0,%%ymm0 \n" "vpand %%ymm4,%%ymm2,%%ymm2 \n" "vpand %%ymm3,%%ymm1,%%ymm1 \n" "vpand %%ymm5,%%ymm0,%%ymm0 \n" "vpor %%ymm2,%%ymm1,%%ymm1 \n" "vpor %%ymm1,%%ymm0,%%ymm0 \n" "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "lea 0x20(%0),%0 \n" "vmovdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(dither4) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBTORGB565DITHERROW_AVX2 void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrld $0x1b,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "pslld $0x5,%%xmm5 \n" "movdqa %%xmm4,%%xmm6 \n" "pslld $0xa,%%xmm6 \n" "pcmpeqb %%xmm7,%%xmm7 \n" "pslld $0xf,%%xmm7 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" "psrad $0x10,%%xmm0 \n" "psrld $0x3,%%xmm1 \n" "psrld $0x6,%%xmm2 \n" "psrld $0x9,%%xmm3 \n" "pand %%xmm7,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" "pand %%xmm5,%%xmm2 \n" "pand %%xmm6,%%xmm3 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm3,%%xmm2 \n" "por %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "lea 0x10(%0),%0 \n" "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0xc,%%xmm4 \n" "movdqa %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pand %%xmm3,%%xmm0 \n" "pand %%xmm4,%%xmm1 \n" "psrlq $0x4,%%xmm0 \n" "psrlq $0x8,%%xmm1 \n" "por %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "lea 0x10(%0),%0 \n" "movq 
%%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_RGB24TOARGBROW_SSSE3 /* ARGBToAR30Row: Red Blue With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats wanted for the blue channel. The red needs to be shifted 4 left, so multiply by (1024+4)*16 for red. Alpha Green Alpha and Green are already in the high bits so vpand can zero out the other bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha would be a simple multiplier to shift it into position. It wants a gap of 10 above the green. Green is 10 bits, so there are 6 bits in the low short. 4 more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the result left 10 to position the A and G channels. */ // Shuffle table for converting RAW to RGB24. Last 8. 
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; static const uint32_t kMaskRB10 = 0x3ff003ff; static const uint32_t kMaskAG10 = 0xc000ff00; static const uint32_t kMulAG10 = 64 * 65536 + 1028; void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG "movd %7,%%xmm6 \n" // multipler for AG "pshufd $0x0,%%xmm3,%%xmm3 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm6,%%xmm6 \n" "sub %0,%1 \n" "1: \n" "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" // R0B0 "pand %%xmm5,%%xmm0 \n" // A0G0 "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels "add $0x10,%0 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleRB30), // %3 "m"(kMulRB10), // %4 "m"(kMaskRB10), // %5 "m"(kMaskAG10), // %6 "m"(kMulAG10) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "movdqa %3,%%xmm2 \n" // shuffler for RB "movd %4,%%xmm3 \n" // multipler for RB "movd %5,%%xmm4 \n" // mask for R10 B10 "movd %6,%%xmm5 \n" // mask for AG "movd %7,%%xmm6 \n" // multipler for AG "pshufd $0x0,%%xmm3,%%xmm3 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm6,%%xmm6 \n" "sub %0,%1 \n" "1: \n" "movdqu 
(%0),%%xmm0 \n" // fetch 4 ABGR pixels "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" // R0B0 "pand %%xmm5,%%xmm0 \n" // A0G0 "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels "add $0x10,%0 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleBR30), // %3 reversed shuffler "m"(kMulRB10), // %4 "m"(kMaskRB10), // %5 "m"(kMaskAG10), // %6 "m"(kMulAG10) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss %4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG "vbroadcastss %7,%%ymm6 \n" // multipler for AG "sub %0,%1 \n" "1: \n" "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels "add $0x20,%0 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleRB30), // %3 "m"(kMulRB10), // %4 "m"(kMaskRB10), // %5 "m"(kMaskAG10), // %6 "m"(kMulAG10) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_ABGRTOAR30ROW_AVX2 void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB "vbroadcastss 
%4,%%ymm3 \n" // multipler for RB "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 "vbroadcastss %6,%%ymm5 \n" // mask for AG "vbroadcastss %7,%%ymm6 \n" // multipler for AG "sub %0,%1 \n" "1: \n" "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels "add $0x20,%0 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "m"(kShuffleBR30), // %3 reversed shuffler "m"(kMulRB10), // %4 "m"(kMaskRB10), // %5 "m"(kMaskAG10), // %6 "m"(kMulAG10) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7}; static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}; void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( "movdqa %3,%%xmm2 \n" "movdqa %4,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm0 \n" "pshufb %%xmm3,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "movdqu 
%%xmm1,0x10(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 : "m"(kShuffleARGBToAB64Lo), // %3 "m"(kShuffleARGBToAB64Hi) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "psrlw $8,%%xmm0 \n" "psrlw $8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x20(%0),%0 \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( "movdqa %3,%%xmm2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "psrlw $8,%%xmm0 \n" "psrlw $8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "pshufb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x20(%0),%0 \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kShuffleARGBToABGR) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #ifdef HAS_ARGBTOAR64ROW_AVX2 void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "lea 0x20(%0),%0 \n" "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } #endif #ifdef HAS_ARGBTOAB64ROW_AVX2 void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" "vbroadcastf128 %4,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" 
"vpshufb %%ymm3,%%ymm0,%%ymm1 \n" "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "lea 0x20(%0),%0 \n" "lea 0x40(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 : "m"(kShuffleARGBToAB64Lo), // %3 "m"(kShuffleARGBToAB64Hi) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif #ifdef HAS_AR64TOARGBROW_AVX2 void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpsrlw $8,%%ymm0,%%ymm0 \n" "vpsrlw $8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x40(%0),%0 \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } #endif #ifdef HAS_AB64TOARGBROW_AVX2 void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vpsrlw $8,%%ymm0,%%ymm0 \n" "vpsrlw $8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x40(%0),%0 \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kShuffleARGBToABGR) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // clang-format off // TODO(mraptis): Consider passing R, G, B multipliers as parameter. // round parameter is register containing value to add before shift. 
// Loop body shared by the SSSE3 "to Y" row functions. Per iteration it reads
// 64 source bytes from %0, writes 16 bytes to %1 and subtracts 16 from %2,
// looping while the result is > 0.
// Register contract set up by the caller before the macro:
//   xmm4  - coefficients; copied into the pmaddubsw destination each time.
//   xmm5  - value subtracted from every source byte (psubb).
//   round - name of the xmm register added to the 16 bit sums before the
//           shift right by 8.
// xmm0-xmm3 and xmm6 are used as scratch.
#define RGBTOY(round) \
  "1: \n" \
  "movdqu (%0),%%xmm0 \n" \
  "movdqu 0x10(%0),%%xmm1 \n" \
  "movdqu 0x20(%0),%%xmm2 \n" \
  "movdqu 0x30(%0),%%xmm3 \n" \
  "psubb %%xmm5,%%xmm0 \n" \
  "psubb %%xmm5,%%xmm1 \n" \
  "psubb %%xmm5,%%xmm2 \n" \
  "psubb %%xmm5,%%xmm3 \n" \
  "movdqu %%xmm4,%%xmm6 \n" \
  "pmaddubsw %%xmm0,%%xmm6 \n" \
  "movdqu %%xmm4,%%xmm0 \n" \
  "pmaddubsw %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm4,%%xmm1 \n" \
  "pmaddubsw %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm4,%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "lea 0x40(%0),%0 \n" \
  "phaddw %%xmm0,%%xmm6 \n" \
  "phaddw %%xmm2,%%xmm1 \n" \
  "prefetcht0 1280(%0) \n" \
  "paddw %%" #round ",%%xmm6 \n" \
  "paddw %%" #round ",%%xmm1 \n" \
  "psrlw $0x8,%%xmm6 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm6 \n" \
  "movdqu %%xmm6,(%1) \n" \
  "lea 0x10(%1),%1 \n" \
  "sub $0x10,%2 \n" \
  "jg 1b \n"

// AVX2 counterpart of RGBTOY. Per iteration it reads 128 source bytes from
// %0, writes 32 bytes to %1 and subtracts 32 from %2. It ends with vzeroupper,
// so callers do not need to issue one themselves.
// Register contract set up by the caller before the macro:
//   ymm4  - coefficients.
//   ymm5  - value subtracted from every source byte (vpsubb).
//   ymm6  - index vector for the final vpermd, which undoes the per-lane
//           ordering left by vphaddw and vpackuswb.
//   round - name of the ymm register added before the shift right by 8.
// ymm0-ymm3 are used as scratch.
#define RGBTOY_AVX2(round) \
  "1: \n" \
  "vmovdqu (%0),%%ymm0 \n" \
  "vmovdqu 0x20(%0),%%ymm1 \n" \
  "vmovdqu 0x40(%0),%%ymm2 \n" \
  "vmovdqu 0x60(%0),%%ymm3 \n" \
  "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
  "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
  "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
  "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
  "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
  "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
  "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
  "lea 0x80(%0),%0 \n" \
  "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
  "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
  "prefetcht0 1280(%0) \n" \
  "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
  "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
  "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
  "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
  "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
  "vmovdqu %%ymm0,(%1) \n" \
  "lea 0x20(%1),%1 \n" \
  "sub $0x20,%2 \n" \
  "jg 1b \n" \
  "vzeroupper \n"

// clang-format on

#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Loads the coefficients (kARGBToY) into xmm4, kSub128 into xmm5 and kAddY16
// into xmm7, then runs the shared RGBTOY loop with xmm7 as the value added
// before the shift.
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"
      "movdqa %5,%%xmm7 \n"

      LABELALIGN
      RGBTOY(xmm7)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToY),   // %3
        "m"(kSub128),    // %4
        "m"(kAddY16)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_SSSE3

#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16.
// The kSub128 register (xmm5) is also passed to RGBTOY as the value added
// before the shift, so xmm7 is not used.
void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      RGBTOY(xmm5)
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kARGBToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBTOYJROW_SSSE3

#ifdef HAS_RGBATOYJROW_SSSE3
// Convert 16 RGBA pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYJRow but with the kRGBAToYJ coefficients.
void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      LABELALIGN
      RGBTOY(xmm5)
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "m"(kRGBAToYJ),  // %3
        "m"(kSub128)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_RGBATOYJROW_SSSE3

#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
// vpermd for vphaddw + vpackuswb vpermd.
// Dword index vector that interleaves the two 128 bit lanes.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif

#ifdef HAS_ARGBTOYROW_AVX2

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// Broadcasts kARGBToY into ymm4, kSub128 into ymm5 and kAddY16 into ymm7,
// loads the vpermd indices into ymm6, then runs the shared RGBTOY_AVX2 loop
// with ymm7 as the value added before the shift.
void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu %6,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm7)
      : "+r"(src_argb),          // %0
        "+r"(dst_y),             // %1
        "+r"(width)              // %2
      : "m"(kARGBToY),           // %3
        "m"(kSub128),            // %4
        "m"(kAddY16),            // %5
        "m"(kPermdARGBToY_AVX)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ABGRTOYROW_AVX2
// Convert 32 ABGR pixels (128 bytes) to 32 Y values.
// Same as ARGBToYRow_AVX2 but with the kABGRToY coefficients.
void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vbroadcastf128 %5,%%ymm7 \n"
      "vmovdqu %6,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm7)
      : "+r"(src_abgr),          // %0
        "+r"(dst_y),             // %1
        "+r"(width)              // %2
      : "m"(kABGRToY),           // %3
        "m"(kSub128),            // %4
        "m"(kAddY16),            // %5
        "m"(kPermdARGBToY_AVX)   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ABGRTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values.
// The kSub128 register (ymm5) is also passed to RGBTOY_AVX2 as the value added
// before the shift; ymm7 is listed as clobbered but is not written here.
void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"
      "vmovdqu %5,%%ymm6 \n"

      LABELALIGN
      RGBTOY_AVX2(ymm5)
      : "+r"(src_argb),          // %0
        "+r"(dst_y),             // %1
        "+r"(width)              // %2
      : "m"(kARGBToYJ),          // %3
        "m"(kSub128),            // %4
        "m"(kPermdARGBToY_AVX)   // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTOYJROW_AVX2

#ifdef HAS_RGBATOYJROW_AVX2
// Convert 32 RGBA pixels (128 bytes) to 32 YJ values.
void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vbroadcastf128 %3,%%ymm4 \n" "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" LABELALIGN RGBTOY_AVX2( ymm5) "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kRGBAToYJ), // %3 "m"(kSub128), // %4 "m"(kPermdARGBToY_AVX) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_RGBATOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x20(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "movdqu 0x30(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "lea 0x40(%0),%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqa %%xmm2,%%xmm7 \n" "shufps $0x88,%%xmm6,%%xmm2 \n" "shufps $0xdd,%%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 "m"(kARGBToV), // %5 "m"(kARGBToU), // %6 "m"(kAddUV128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // 
HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_AVX2 // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" "vmovdqu 0x60(%0),%%ymm3 \n" "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" "lea 0x80(%0),%0 \n" "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpsraw $0x8,%%ymm1,%%ymm1 \n" "vpsraw $0x8,%%ymm0,%%ymm0 \n" "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %8,%%ymm0,%%ymm0 \n" "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 "m"(kAddUV128), // %5 "m"(kARGBToV), // %6 "m"(kARGBToU), // %7 "m"(kShufARGBToUV_AVX) // %8 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ABGRTOUVROW_AVX2 void ABGRToUVRow_AVX2(const uint8_t* src_abgr, int 
src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" "vmovdqu 0x60(%0),%%ymm3 \n" "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" "lea 0x80(%0),%0 \n" "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpsraw $0x8,%%ymm1,%%ymm1 \n" "vpsraw $0x8,%%ymm0,%%ymm0 \n" "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %8,%%ymm0,%%ymm0 \n" "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_abgr)), // %4 "m"(kAddUV128), // %5 "m"(kABGRToV), // %6 "m"(kABGRToU), // %7 "m"(kShufARGBToUV_AVX) // %8 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ABGRTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vbroadcastf128 %6,%%ymm6 \n" "vbroadcastf128 %7,%%ymm7 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x40(%0),%%ymm2 \n" "vmovdqu 0x60(%0),%%ymm3 \n" 
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" "lea 0x80(%0),%0 \n" "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" "vpsraw $0x8,%%ymm1,%%ymm1 \n" "vpsraw $0x8,%%ymm0,%%ymm0 \n" "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %8,%%ymm0,%%ymm0 \n" "vextractf128 $0x0,%%ymm0,(%1) \n" "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" "lea 0x10(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 "m"(kSub128), // %5 "m"(kARGBToVJ), // %6 "m"(kARGBToUJ), // %7 "m"(kShufARGBToUV_AVX) // %8 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x20(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "movdqu 0x30(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "lea 0x40(%0),%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 
\n" "shufps $0xdd,%%xmm1,%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqa %%xmm2,%%xmm7 \n" "shufps $0x88,%%xmm6,%%xmm2 \n" "shufps $0xdd,%%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" "paddw %%xmm5,%%xmm0 \n" "paddw %%xmm5,%%xmm1 \n" "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 "m"(kARGBToVJ), // %5 "m"(kARGBToUJ), // %6 "m"(kSub128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "movdqa %4,%%xmm3 \n" "movdqa %5,%%xmm4 \n" "movdqa %6,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm6 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm6,%%xmm2 \n" "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm0 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm2 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm6,%%xmm2 \n" "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm2 \n" "packsswb %%xmm2,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "lea 0x40(%0),%0 \n" "movdqu %%xmm0,0x00(%1,%2,1) \n" "lea 0x10(%1),%1 \n" 
"sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "m"(kARGBToV), // %4 "m"(kARGBToU), // %5 "m"(kAddUV128) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); } #endif // HAS_ARGBTOUV444ROW_SSSE3 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kBGRAToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x20(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "movdqu 0x30(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "lea 0x40(%0),%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqa %%xmm2,%%xmm7 \n" "shufps $0x88,%%xmm6,%%xmm2 \n" "shufps $0xdd,%%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : 
"r"((intptr_t)(src_stride_bgra)), // %4 "m"(kBGRAToV), // %5 "m"(kBGRAToU), // %6 "m"(kAddUV128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kABGRToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" LABELALIGN RGBTOY(xmm7) : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kRGBAToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x20(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "movdqu 0x30(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "lea 0x40(%0),%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqa %%xmm2,%%xmm7 \n" "shufps $0x88,%%xmm6,%%xmm2 \n" "shufps $0xdd,%%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" "psraw $0x8,%%xmm0 
\n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_abgr)), // %4 "m"(kABGRToV), // %5 "m"(kABGRToU), // %6 "m"(kAddUV128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "movdqa %5,%%xmm3 \n" "movdqa %6,%%xmm4 \n" "movdqa %7,%%xmm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x10(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x20(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqu 0x30(%0),%%xmm6 \n" "movdqu 0x30(%0,%4,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "lea 0x40(%0),%0 \n" "movdqa %%xmm0,%%xmm7 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm7 \n" "pavgb %%xmm7,%%xmm0 \n" "movdqa %%xmm2,%%xmm7 \n" "shufps $0x88,%%xmm6,%%xmm2 \n" "shufps $0xdd,%%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "movdqa %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "phaddw %%xmm2,%%xmm0 \n" "phaddw %%xmm6,%%xmm1 \n" "psraw $0x8,%%xmm0 \n" "psraw $0x8,%%xmm1 \n" "packsswb %%xmm1,%%xmm0 \n" "paddb %%xmm5,%%xmm0 \n" "movlps %%xmm0,(%1) \n" "movhps %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_rgba)), // %4 "m"(kRGBAToV), // %5 "m"(kRGBAToU), // %6 "m"(kAddUV128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // 
// The READ* macros below are asm fragments that load one group of pixels for
// the YUV to RGB row functions. They leave interleaved UV bytes in xmm3 and
// 16 bit Y values in xmm4; the alpha variants additionally leave 8 alpha
// bytes in xmm5. Each macro advances the pointers it reads from.
// v_buf is used as an index added to u_buf, so callers presumably store the
// V pointer as an offset from the U pointer - confirm against the callers.

// Read 8 UV from 444
#define READYUV444 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV
#define READYUV422 \
  "movd (%[u_buf]),%%xmm3 \n" \
  "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x4(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422 10 bit, upsample to 8 UV
// UV is shifted right by 2 and packed to bytes; Y is shifted left by 6.
// TODO(fbarchard): Consider shufb to replace pack/unpack
// TODO(fbarchard): Consider pmulhuw to replace psraw
// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
#define READYUV210 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $2,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422 10 bit, upsample to 8 UV. With 8 Alpha.
// Same as READYUV210, plus 8 alpha values read from a_buf, shifted right by 2
// and packed to bytes in xmm5.
#define READYUVA210 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $2,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "movdqu (%[a_buf]),%%xmm5 \n" \
  "psraw $2,%%xmm5 \n" \
  "packuswb %%xmm5,%%xmm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 444 10 bit
#define READYUV410 \
  "movdqu (%[u_buf]),%%xmm3 \n" \
  "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "psraw $2,%%xmm3 \n" \
  "psraw $2,%%xmm2 \n" \
  "movdqa %%xmm3,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm3 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 444 10 bit. With 8 Alpha.
#define READYUVA410 \
  "movdqu (%[u_buf]),%%xmm3 \n" \
  "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "psraw $2,%%xmm3 \n" \
  "psraw $2,%%xmm2 \n" \
  "movdqa %%xmm3,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm3 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $0x6,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "movdqu (%[a_buf]),%%xmm5 \n" \
  "psraw $2,%%xmm5 \n" \
  "packuswb %%xmm5,%%xmm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 4 UV from 422 12 bit, upsample to 8 UV
// UV is shifted right by 4 and packed to bytes; Y is shifted left by 4.
#define READYUV212 \
  "movq (%[u_buf]),%%xmm3 \n" \
  "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "punpcklwd %%xmm1,%%xmm3 \n" \
  "psraw $0x4,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "psllw $0x4,%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \
  "movd (%[u_buf]),%%xmm3 \n" \
  "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x4(%[u_buf]),%[u_buf] \n" \
  "punpcklbw %%xmm1,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movq (%[y_buf]),%%xmm4 \n" \
  "punpcklbw %%xmm4,%%xmm4 \n" \
  "lea 0x8(%[y_buf]),%[y_buf] \n" \
  "movq (%[a_buf]),%%xmm5 \n" \
  "lea 0x8(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 444. With 8 Alpha.
#define READYUVA444 \ "movq (%[u_buf]),%%xmm3 \n" \ "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" \ "movq (%[a_buf]),%%xmm5 \n" \ "lea 0x8(%[a_buf]),%[a_buf] \n" // Read 4 UV from NV12, upsample to 8 UV #define READNV12 \ "movq (%[uv_buf]),%%xmm3 \n" \ "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ "punpcklwd %%xmm3,%%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV #define READNV21 \ "movq (%[vu_buf]),%%xmm3 \n" \ "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ "pshufb %[kShuffleNV21], %%xmm3 \n" \ "movq (%[y_buf]),%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \ "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. #define READYUY2 \ "movdqu (%[yuy2_buf]),%%xmm4 \n" \ "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ "movdqu (%[yuy2_buf]),%%xmm3 \n" \ "pshufb %[kShuffleYUY2UV], %%xmm3 \n" \ "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. 
// Loads the same 16 source bytes twice and shuffles them once into Y (xmm4)
// and once into UV (xmm3) using the kShuffleUYVYY / kShuffleUYVYUV operands
// that the enclosing asm statement must supply.
#define READUYVY \
  "movdqu (%[uyvy_buf]),%%xmm4 \n" \
  "pshufb %[kShuffleUYVYY], %%xmm4 \n" \
  "movdqu (%[uyvy_buf]),%%xmm3 \n" \
  "pshufb %[kShuffleUYVYUV], %%xmm3 \n" \
  "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n"

// Read 4 UV from P210, upsample to 8 UV
// UV words are reduced to their high byte (psrlw 8), packed to bytes and each
// UV pair is duplicated. Y words are loaded into xmm4 without any shift.
#define READP210 \
  "movdqu (%[uv_buf]),%%xmm3 \n" \
  "lea 0x10(%[uv_buf]),%[uv_buf] \n" \
  "psrlw $0x8,%%xmm3 \n" \
  "packuswb %%xmm3,%%xmm3 \n" \
  "punpcklwd %%xmm3,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from P410
// Two 16-byte loads (xmm1 is scratch) are reduced to their high bytes and
// packed into 8 UV byte pairs in xmm3. Y words are loaded without any shift.
#define READP410 \
  "movdqu (%[uv_buf]),%%xmm3 \n" \
  "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \
  "lea 0x20(%[uv_buf]),%[uv_buf] \n" \
  "psrlw $0x8,%%xmm3 \n" \
  "psrlw $0x8,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm3 \n" \
  "movdqu (%[y_buf]),%%xmm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

#if defined(__x86_64__)
// x86_64 has xmm8-xmm15, so the conversion constants are hoisted out of the
// pixel loop: xmm8..xmm12 receive the 16-byte values found at byte offsets
// 0, 32, 64, 96 and 128 of *yuvconstants, and xmm13 is filled with 0x80 in
// every byte (all-ones, shifted left 7 per word, byte 0 broadcast by pshufb
// with the zero mask that xmm12 temporarily holds).
// movdqa requires *yuvconstants to be 16-byte aligned.
#define YUVTORGB_SETUP(yuvconstants) \
  "pcmpeqb %%xmm13,%%xmm13 \n" \
  "movdqa (%[yuvconstants]),%%xmm8 \n" \
  "pxor %%xmm12,%%xmm12 \n" \
  "movdqa 32(%[yuvconstants]),%%xmm9 \n" \
  "psllw $7,%%xmm13 \n" \
  "movdqa 64(%[yuvconstants]),%%xmm10 \n" \
  "pshufb %%xmm12,%%xmm13 \n" \
  "movdqa 96(%[yuvconstants]),%%xmm11 \n" \
  "movdqa 128(%[yuvconstants]),%%xmm12 \n"

// Convert 8 pixels: 8 UV and 8 Y
// In:  xmm3 = UV byte pairs, xmm4 = Y in 16-bit lanes.
// Out: xmm0, xmm1, xmm2 = three 16-bit colour channels, not yet shifted or
//      packed (the STORE* macros write them in that byte order).
// 0x80 is subtracted from every UV byte so pmaddubsw sees signed chroma.
// xmm1 is computed as (Y term) - (UV term); the other two channels add.
#define YUVTORGB16(yuvconstants) \
  "psubb %%xmm13,%%xmm3 \n" \
  "pmulhuw %%xmm11,%%xmm4 \n" \
  "movdqa %%xmm8,%%xmm0 \n" \
  "movdqa %%xmm9,%%xmm1 \n" \
  "movdqa %%xmm10,%%xmm2 \n" \
  "paddw %%xmm12,%%xmm4 \n" \
  "pmaddubsw %%xmm3,%%xmm0 \n" \
  "pmaddubsw %%xmm3,%%xmm1 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psubsw %%xmm1,%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm1 \n"
// Extra clobbers for the hoisted constants. The trailing comma is intentional:
// the macro is pasted directly in front of further clobber strings.
#define YUVTORGB_REGS "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else
// Without xmm8+ nothing is hoisted: setup is empty and every YUVTORGB16
// expansion rebuilds the 0x80 bias and reads the constants from memory.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
// Same inputs and outputs as the x86_64 variant. Note that xmm3 (UV) is
// overwritten with the value at offset 128 once the three pmaddubsw are done.
#define YUVTORGB16(yuvconstants) \
  "pcmpeqb %%xmm0,%%xmm0 \n" \
  "pxor %%xmm1,%%xmm1 \n" \
  "psllw $7,%%xmm0 \n" \
  "pshufb %%xmm1,%%xmm0 \n" \
  "psubb %%xmm0,%%xmm3 \n" \
  "pmulhuw 96(%[yuvconstants]),%%xmm4 \n" \
  "movdqa (%[yuvconstants]),%%xmm0 \n" \
  "movdqa 32(%[yuvconstants]),%%xmm1 \n" \
  "movdqa 64(%[yuvconstants]),%%xmm2 \n" \
  "pmaddubsw %%xmm3,%%xmm0 \n" \
  "pmaddubsw %%xmm3,%%xmm1 \n" \
  "pmaddubsw %%xmm3,%%xmm2 \n" \
  "movdqa 128(%[yuvconstants]),%%xmm3 \n" \
  "paddw %%xmm3,%%xmm4 \n" \
  "paddsw %%xmm4,%%xmm0 \n" \
  "paddsw %%xmm4,%%xmm2 \n" \
  "psubsw %%xmm1,%%xmm4 \n" \
  "movdqa %%xmm4,%%xmm1 \n"
// No extra registers are used, so nothing is added to the clobber list.
#define YUVTORGB_REGS
#endif

// 8-bit output variant: runs YUVTORGB16, then shifts each channel right by 6
// (arithmetic) and packs with unsigned saturation, leaving 8 bytes per channel
// duplicated in both halves of xmm0, xmm1 and xmm2.
#define YUVTORGB(yuvconstants) \
  YUVTORGB16(yuvconstants) \
  "psraw $0x6,%%xmm0 \n" \
  "psraw $0x6,%%xmm1 \n" \
  "psraw $0x6,%%xmm2 \n" \
  "packuswb %%xmm0,%%xmm0 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "packuswb %%xmm2,%%xmm2 \n"

// Store 8 ARGB values.
// Memory byte order per pixel is xmm0, xmm1, xmm2, xmm5; xmm5 must hold the
// alpha bytes on entry. Writes 32 bytes and advances dst_argb by 32.
#define STOREARGB \
  "punpcklbw %%xmm1,%%xmm0 \n" \
  "punpcklbw %%xmm5,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm1 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm1 \n" \
  "movdqu %%xmm0,(%[dst_argb]) \n" \
  "movdqu %%xmm1,0x10(%[dst_argb]) \n" \
  "lea 0x20(%[dst_argb]), %[dst_argb] \n"

// Store 8 RGBA values.
// Memory byte order per pixel is 0xff, xmm0, xmm1, xmm2: xmm5 is reset to
// all-ones here, so alpha is always opaque regardless of xmm5 on entry.
// Writes 32 bytes and advances dst_rgba by 32.
#define STORERGBA \
  "pcmpeqb %%xmm5,%%xmm5 \n" \
  "punpcklbw %%xmm2,%%xmm1 \n" \
  "punpcklbw %%xmm0,%%xmm5 \n" \
  "movdqa %%xmm5,%%xmm0 \n" \
  "punpcklwd %%xmm1,%%xmm5 \n" \
  "punpckhwd %%xmm1,%%xmm0 \n" \
  "movdqu %%xmm5,(%[dst_rgba]) \n" \
  "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \
  "lea 0x20(%[dst_rgba]),%[dst_rgba] \n"

// Store 8 AR30 values.
// Input is the unshifted 16-bit output of YUVTORGB16 in xmm0/xmm1/xmm2.
// Each channel is shifted right by 4 and clamped to [xmm6, xmm7]; callers load
// xmm6 = 0 and xmm7 = 1023, and xmm5 = 0x30 per word (the two alpha bits).
// Per 32-bit pixel the result is xmm0 | xmm1 << 10 | xmm2 << 20 | 3 << 30.
// Uses xmm3 as scratch, writes 32 bytes and advances dst_ar30 by 32.
#define STOREAR30 \
  "psraw $0x4,%%xmm0 \n" \
  "psraw $0x4,%%xmm1 \n" \
  "psraw $0x4,%%xmm2 \n" \
  "pminsw %%xmm7,%%xmm0 \n" \
  "pminsw %%xmm7,%%xmm1 \n" \
  "pminsw %%xmm7,%%xmm2 \n" \
  "pmaxsw %%xmm6,%%xmm0 \n" \
  "pmaxsw %%xmm6,%%xmm1 \n" \
  "pmaxsw %%xmm6,%%xmm2 \n" \
  "psllw $0x4,%%xmm2 \n" \
  "movdqa %%xmm0,%%xmm3 \n" \
  "punpcklwd %%xmm2,%%xmm0 \n" \
  "punpckhwd %%xmm2,%%xmm3 \n" \
  "movdqa %%xmm1,%%xmm2 \n" \
  "punpcklwd %%xmm5,%%xmm1 \n" \
  "punpckhwd %%xmm5,%%xmm2 \n" \
  "pslld $0xa,%%xmm1 \n" \
  "pslld $0xa,%%xmm2 \n" \
  "por %%xmm1,%%xmm0 \n" \
  "por %%xmm2,%%xmm3 \n" \
  "movdqu %%xmm0,(%[dst_ar30]) \n" \
  "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \
  "lea 0x20(%[dst_ar30]), %[dst_ar30] \n"

// Notes common to every *Row_SSSE3 function below:
//  - Each loop iteration converts 8 pixels. The loop is bottom-tested
//    ("sub $8,width; jg 1b"), so at least one group of 8 is always processed
//    and a width that is not a multiple of 8 is rounded up; the buffers must
//    be able to absorb that over-read/over-write.
//  - Planar variants execute "sub u_buf,v_buf" first so v_buf becomes an
//    offset from u_buf, which is how the READ* macros address V.
//  - Non-alpha ARGB variants preload xmm5 with all-ones (opaque alpha).
//  - On i386 the alpha variants and RGB24 force width into memory ("+m") and
//    use "subl"; presumably this frees a general-purpose register - confirm.

// 8-bit I444 (full-resolution U and V planes) to ARGB.
void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_I444ALPHATOARGBROW_SSSE3
// 8-bit I444 plus a separate alpha plane to ARGB; alpha comes from a_buf via
// READYUVA444 instead of the constant 0xff.
void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA444
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_SSSE3

// 8-bit I422 to 24-bit RGB. The pixels are first built as 4-byte groups
// (xmm2 is unpacked with itself, so the 4th byte is a don't-care copy), then
// two pshufb masks plus palignr squeeze them to 3 bytes per pixel. 24 bytes
// are written per iteration as an 8-byte and a 16-byte store.
void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
                                 const uint8_t* u_buf,
                                 const uint8_t* v_buf,
                                 uint8_t* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm2,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "movq %%xmm0,(%[dst_rgb24]) \n"
    "movdqu %%xmm1,0x8(%[dst_rgb24]) \n"
    "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n"
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

// 8-bit I422 (half-width U and V planes) to ARGB.
void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 8-bit I422 to AR30. Uses the unshifted YUVTORGB16 output and STOREAR30;
// xmm5/xmm6/xmm7 are loaded with the alpha bits, minimum and maximum first.
void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"  // AR30 constants
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV210
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 12 bit YUV to ARGB
void OMITFP I212ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV212
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

// 10 bit YUV to AR30
void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

    LABELALIGN
    "1: \n"
    READYUV210
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 12 bit YUV to AR30
void OMITFP I212ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

    LABELALIGN
    "1: \n"
    READYUV212
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 10 bit YUV to ARGB
// (444 layout: READYUV410 reads 8 U and 8 V samples per 8 pixels.)
void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV410
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#ifdef HAS_I210ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "sub %[u_buf],%[v_buf] \n"

      LABELALIGN
      "1: \n"
      READYUVA210
      YUVTORGB(yuvconstants)
      STOREARGB
      "subl $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf] "+r"(y_buf),  // %[y_buf]
        [u_buf] "+r"(u_buf),  // %[u_buf]
        [v_buf] "+r"(v_buf),  // %[v_buf]
        [a_buf] "+r"(a_buf),  // %[a_buf]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
        [width] "+m"(width)  // %[width]
#else
        [width] "+rm"(width)  // %[width]
#endif
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif

#ifdef HAS_I410ALPHATOARGBROW_SSSE3
// 10 bit YUVA to ARGB
// (444 layout: READYUVA410 reads 8 U and 8 V samples per 8 pixels.)
void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf,
                                     const uint16_t* u_buf,
                                     const uint16_t* v_buf,
                                     const uint16_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile(
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA410
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
    : [y_buf] "+r"(y_buf),  // %[y_buf]
      [u_buf] "+r"(u_buf),  // %[u_buf]
      [v_buf] "+r"(v_buf),  // %[v_buf]
      [a_buf] "+r"(a_buf),  // %[a_buf]
      [dst_argb] "+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
      [width] "+m"(width)  // %[width]
#else
      [width] "+rm"(width)  // %[width]
#endif
    : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
  // clang-format on
}
#endif

// 10 bit YUV to AR30
// (444 layout, see READYUV410.)
void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* u_buf,
                                const uint16_t* v_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

    LABELALIGN
    "1: \n"
    READYUV410
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// 8-bit I422 plus a separate alpha plane to ARGB.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
                                     const uint8_t* u_buf,
                                     const uint8_t* v_buf,
                                     const uint8_t* a_buf,
                                     uint8_t* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3

// NV12 (Y plane + interleaved UV plane) to ARGB.
void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// NV21 (Y plane + interleaved VU plane) to ARGB. kShuffleNV21 is passed as a
// memory operand for the pshufb inside READNV21.
void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* vu_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// Packed YUY2 to ARGB. The two shuffle tables are passed as memory operands
// for the pshufb instructions inside READYUY2.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// Packed UYVY to ARGB. The two shuffle tables are passed as memory operands
// for the pshufb instructions inside READUYVY.
void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}

// P210 (16-bit Y plane + interleaved 16-bit UV plane, 4 UV per 8 pixels) to
// ARGB.
void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READP210
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf] "+r"(y_buf),  // %[y_buf]
        [uv_buf] "+r"(uv_buf),  // %[uv_buf]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+rm"(width)  // %[width]
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// P410 (16-bit Y plane + interleaved 16-bit UV plane, 8 UV per 8 pixels) to
// ARGB.
void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile(
      YUVTORGB_SETUP(yuvconstants)
      "pcmpeqb %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      READP410
      YUVTORGB(yuvconstants)
      STOREARGB
      "sub $0x8,%[width] \n"
      "jg 1b \n"
      : [y_buf] "+r"(y_buf),  // %[y_buf]
        [uv_buf] "+r"(uv_buf),  // %[uv_buf]
        [dst_argb] "+r"(dst_argb),  // %[dst_argb]
        [width] "+rm"(width)  // %[width]
      : [yuvconstants] "r"(yuvconstants)  // %[yuvconstants]
      : "memory", "cc", YUVTORGB_REGS
        "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// P210 to AR30.
void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

    LABELALIGN
    "1: \n"
    READP210
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// P410 to AR30.
void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf,
                                const uint16_t* uv_buf,
                                uint8_t* dst_ar30,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $14,%%xmm5 \n"
    "psllw $4,%%xmm5 \n"  // 2 alpha bits
    "pxor %%xmm6,%%xmm6 \n"  // 0 for min
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $6,%%xmm7 \n"  // 1023 for max

    LABELALIGN
    "1: \n"
    READP410
    YUVTORGB16(yuvconstants)
    STOREAR30
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),              // %[y_buf]
    [uv_buf]"+r"(uv_buf),            // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),        // %[dst_ar30]
    [width]"+rm"(width)              // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// 8-bit I422 to RGBA. The all-ones xmm5 loaded before the loop is redundant
// with STORERGBA, which sets xmm5 to all-ones itself on every iteration.
void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
                                const uint8_t* u_buf,
                                const uint8_t* v_buf,
                                uint8_t* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}

#endif  // HAS_I422TOARGBROW_SSSE3

// Read 16 UV from 444
// AVX2 register contract: UV byte pairs in ymm3, Y in ymm4. Unpack and pack
// instructions work per 128-bit lane, so vpermq $0xd8 (64-bit quarters
// reordered 0,2,1,3) is applied around them to keep elements in pixel order.
// V is addressed as (u_buf + v_buf): v_buf must hold the offset v_buf - u_buf.
#define READYUV444_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 \
  "vmovq (%[u_buf]),%%xmm3 \n" \
  "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 210, upsample to 16 UV
// TODO(fbarchard): Consider vshufb to replace pack/unpack
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
// 8 U and 8 V 16-bit words are interleaved, shifted right by 2 (10 -> 8 bits),
// packed to bytes and each UV pair is duplicated into ymm3. 16 Y words are
// loaded into ymm4 and shifted left by 6. V is addressed as (u_buf + v_buf),
// so v_buf must hold the offset v_buf - u_buf.
#define READYUV210_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
// Alpha words are shifted right by 2 and packed to bytes in ymm5.
#define READYUVA210_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%ymm5 \n" \
  "vpsraw $2,%%ymm5,%%ymm5 \n" \
  "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
  "lea 0x20(%[a_buf]),%[a_buf] \n"

// Read 16 UV from 410
// Uses ymm1 and ymm2 as scratch. The per-lane unpack low/high followed by a
// per-lane pack leaves the 16 UV pairs in pixel order without any vpermq.
#define READYUV410_AVX2 \
  "vmovdqu (%[u_buf]),%%ymm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
  "lea 0x20(%[u_buf]),%[u_buf] \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm2,%%ymm2 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read 8 UV from 212 12 bit, upsample to 16 UV
// Identical to READYUV210_AVX2 except for the shift counts: UV >> 4, Y << 4.
#define READYUV212_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklwd %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpsraw $0x4,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n"

// Read 16 UV from 410. With 16 Alpha.
#define READYUVA410_AVX2 \
  "vmovdqu (%[u_buf]),%%ymm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \
  "lea 0x20(%[u_buf]),%[u_buf] \n" \
  "vpsraw $2,%%ymm3,%%ymm3 \n" \
  "vpsraw $2,%%ymm2,%%ymm2 \n" \
  "vpunpckhwd %%ymm2,%%ymm3,%%ymm1 \n" \
  "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
  "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%ymm4 \n" \
  "vpsllw $6,%%ymm4,%%ymm4 \n" \
  "lea 0x20(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%ymm5 \n" \
  "vpsraw $2,%%ymm5,%%ymm5 \n" \
  "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \
  "lea 0x20(%[a_buf]),%[a_buf] \n"

// Read 16 UV from 444. With 16 Alpha.
// The 16 alpha bytes are loaded into xmm5 and spread by vpermq $0xd8 so that
// each 128-bit lane of ymm5 has 8 of them in its low half.
#define READYUVA444_AVX2 \
  "vmovdqu (%[u_buf]),%%xmm3 \n" \
  "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x10(%[u_buf]),%[u_buf] \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%xmm5 \n" \
  "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
#define READYUVA422_AVX2 \
  "vmovq (%[u_buf]),%%xmm3 \n" \
  "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \
  "lea 0x8(%[u_buf]),%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm3,%%ymm3 \n" \
  "vpermq $0xd8,%%ymm3,%%ymm3 \n" \
  "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
  "vmovdqu (%[y_buf]),%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea 0x10(%[y_buf]),%[y_buf] \n" \
  "vmovdqu (%[a_buf]),%%xmm5 \n" \
  "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  "lea 0x10(%[a_buf]),%[a_buf] \n"

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 \ "vmovdqu (%[uv_buf]),%%xmm3 \n" \ "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. #define READNV21_AVX2 \ "vmovdqu (%[vu_buf]),%%xmm3 \n" \ "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vpshufb %[kShuffleNV21], %%ymm3, %%ymm3 \n" \ "vmovdqu (%[y_buf]),%%xmm4 \n" \ "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from P210, upsample to 8 UV #define READP210_AVX2 \ "vmovdqu (%[uv_buf]),%%ymm3 \n" \ "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \ "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from P410 #define READP410_AVX2 \ "vmovdqu (%[uv_buf]),%%ymm3 \n" \ "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ "vpsrlw $0x8,%%ymm3,%%ymm3 \n" \ "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. #define READYUY2_AVX2 \ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ "vmovdqu (%[yuy2_buf]),%%ymm3 \n" \ "vpshufb %[kShuffleYUY2UV], %%ymm3, %%ymm3 \n" \ "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
// The same 32 source bytes are loaded twice and shuffled once into Y (ymm4)
// and once into UV (ymm3) using memory operands supplied by the enclosing asm.
#define READUYVY_AVX2 \
  "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \
  "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  "vmovdqu (%[uyvy_buf]),%%ymm3 \n" \
  "vpshufb %[kShuffleUYVYUV], %%ymm3, %%ymm3 \n" \
  "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n"

#if defined(__x86_64__)
// x86_64: hoist the constants out of the pixel loop. ymm8..ymm12 receive the
// 32-byte values at byte offsets 0, 32, 64, 96 and 128 of *yuvconstants, and
// ymm13 is filled with 0x80 in every byte. vmovdqa on ymm requires
// *yuvconstants to be 32-byte aligned.
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
  "vpcmpeqb %%xmm13,%%xmm13,%%xmm13 \n" \
  "vmovdqa (%[yuvconstants]),%%ymm8 \n" \
  "vpsllw $7,%%xmm13,%%xmm13 \n" \
  "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \
  "vpbroadcastb %%xmm13,%%ymm13 \n" \
  "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \
  "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \
  "vmovdqa 128(%[yuvconstants]),%%ymm12 \n"

// Convert 16 pixels: 16 UV and 16 Y.
// In:  ymm3 = UV byte pairs, ymm4 = Y in 16-bit lanes.
// Out: ymm0, ymm1, ymm2 = three 16-bit colour channels, not yet shifted or
//      packed. ymm1 is computed as (Y term) - (UV term); the others add.
#define YUVTORGB16_AVX2(yuvconstants) \
  "vpsubb %%ymm13,%%ymm3,%%ymm3 \n" \
  "vpmulhuw %%ymm11,%%ymm4,%%ymm4 \n" \
  "vpmaddubsw %%ymm3,%%ymm8,%%ymm0 \n" \
  "vpmaddubsw %%ymm3,%%ymm9,%%ymm1 \n" \
  "vpmaddubsw %%ymm3,%%ymm10,%%ymm2 \n" \
  "vpaddw %%ymm4,%%ymm12,%%ymm4 \n" \
  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"

// Extra clobbers for the hoisted constants (named as xmm registers). The
// trailing comma is intentional: the macro is pasted directly in front of
// further clobber strings.
#define YUVTORGB_REGS_AVX2 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",

#else  // Convert 16 pixels: 16 UV and 16 Y.

// Without ymm8+ nothing is hoisted: setup is empty and every YUVTORGB16_AVX2
// expansion rebuilds the 0x80 bias and reads the constants from memory.
// ymm3 (UV) is overwritten with the value at offset 128 once it has been used.
#define YUVTORGB_SETUP_AVX2(yuvconstants)
#define YUVTORGB16_AVX2(yuvconstants) \
  "vpcmpeqb %%xmm0,%%xmm0,%%xmm0 \n" \
  "vpsllw $7,%%xmm0,%%xmm0 \n" \
  "vpbroadcastb %%xmm0,%%ymm0 \n" \
  "vpsubb %%ymm0,%%ymm3,%%ymm3 \n" \
  "vpmulhuw 96(%[yuvconstants]),%%ymm4,%%ymm4 \n" \
  "vmovdqa (%[yuvconstants]),%%ymm0 \n" \
  "vmovdqa 32(%[yuvconstants]),%%ymm1 \n" \
  "vmovdqa 64(%[yuvconstants]),%%ymm2 \n" \
  "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" \
  "vpmaddubsw %%ymm3,%%ymm1,%%ymm1 \n" \
  "vpmaddubsw %%ymm3,%%ymm2,%%ymm2 \n" \
  "vmovdqa 128(%[yuvconstants]),%%ymm3 \n" \
  "vpaddw %%ymm4,%%ymm3,%%ymm4 \n" \
  "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
  "vpsubsw %%ymm1,%%ymm4,%%ymm1 \n" \
  "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n"
// No extra registers are used, so nothing is added to the clobber list.
#define YUVTORGB_REGS_AVX2
#endif

// 8-bit output variant: runs YUVTORGB16_AVX2, then shifts each channel right
// by 6 (arithmetic) and packs with unsigned saturation within each lane.
#define YUVTORGB_AVX2(yuvconstants) \
  YUVTORGB16_AVX2(yuvconstants) \
  "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
  "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
  "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
  "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
  "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
  "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"

// Store 16 ARGB values.
// Memory byte order per pixel is ymm0, ymm1, ymm2, ymm5; ymm5 must hold the
// alpha bytes on entry. Writes 64 bytes and advances dst_argb by 64.
#define STOREARGB_AVX2 \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  "vmovdqu %%ymm1,(%[dst_argb]) \n" \
  "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \
  "lea 0x40(%[dst_argb]), %[dst_argb] \n"

// Store 16 AR30 values.
#define STOREAR30_AVX2 \ "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ "vpslld $0xa,%%ymm1,%%ymm1 \n" \ "vpslld $0xa,%%ymm2,%%ymm2 \n" \ "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I444TOARGBROW_AVX2 #if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
// The loop body always runs at least once and consumes 16 pixels per pass, so
// width is presumably expected to be a positive multiple of 16.
void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones; STOREARGB_AVX2 uses it as the fourth byte of each pixel.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2

#if defined(HAS_I422TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
// Sets up the three constants STOREAR30_AVX2 relies on (ymm5, ymm6, ymm7)
// before entering the loop; 16 pixels are consumed per pass.
void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
                               const uint8_t* u_buf,
                               const uint8_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I422TOAR30ROW_AVX2

#if defined(HAS_I210TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Inputs are 16-bit samples (uint16_t planes); 16 pixels are consumed per pass.
void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones; STOREARGB_AVX2 uses it as the fourth byte of each pixel.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV210_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I210TOARGBROW_AVX2

#if defined(HAS_I212TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Identical in structure to I210ToARGBRow_AVX2 except that samples are loaded
// with READYUV212_AVX2.
void OMITFP I212ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV212_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I212TOARGBROW_AVX2

#if defined(HAS_I210TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
// Sets up the three constants STOREAR30_AVX2 relies on (ymm5, ymm6, ymm7)
// before entering the loop. The loop always runs at least once and consumes
// 16 pixels per pass, so width is presumably a positive multiple of 16.
void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READYUV210_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I210TOAR30ROW_AVX2

#if defined(HAS_I212TOAR30ROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
// Sets up the three constants STOREAR30_AVX2 relies on (ymm5, ymm6, ymm7)
// before entering the loop; 16 pixels are consumed per pass.
void OMITFP I212ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READYUV212_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I212TOAR30ROW_AVX2

#if defined(HAS_I410TOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// Inputs are 16-bit samples (uint16_t planes). The loop always runs at least
// once and consumes 16 pixels per pass.
void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"
    // ymm5 = all ones; STOREARGB_AVX2 uses it as the fourth byte of each pixel.
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"

    LABELALIGN
    "1: \n"
    READYUV410_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"
    "vzeroupper \n"

  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I410TOARGBROW_AVX2

#if defined(HAS_I210ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes).
void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] [a_buf] "+r"(a_buf), // %[a_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) [width] "+m"(width) // %[width] #else [width] "+rm"(width) // %[width] #endif : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_I210TOARGBROW_AVX2 #if defined(HAS_I410ALPHATOARGBROW_AVX2) // 16 pixels // 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* u_buf, const uint16_t* v_buf, const uint16_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP_AVX2( yuvconstants) "sub %[u_buf],%[v_buf] \n" LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "subl $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] [a_buf] "+r"(a_buf), // %[a_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] #if defined(__i386__) [width] "+m"(width) // %[width] #else [width] "+rm"(width) // %[width] #endif : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_I410TOARGBROW_AVX2 #if defined(HAS_I410TOAR30ROW_AVX2) // 16 pixels // 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
// Sets up the three constants STOREAR30_AVX2 relies on (ymm5, ymm6, ymm7)
// before entering the loop. The loop always runs at least once and consumes
// 16 pixels per pass, so width is presumably a positive multiple of 16.
void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* u_buf,
                               const uint16_t* v_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READYUV410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_I410TOAR30ROW_AVX2

#if defined(HAS_I444ALPHATOARGBROW_AVX2)
// 16 pixels
// 16 UV values with 16 Y and 16 A producing 16 ARGB.
// ymm5 is not preset here; presumably READYUVA444_AVX2 (defined outside this
// section) loads the alpha bytes from a_buf into ymm5 for STOREARGB_AVX2.
// The loop always runs at least once and consumes 16 pixels per pass.
void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // Explicit 'l' suffix: on i386 width is a memory operand.
    "subl $0x10,%[width] \n"
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I444ALPHATOARGBROW_AVX2

#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// ymm5 is not preset here; presumably READYUVA422_AVX2 (defined outside this
// section) loads the alpha bytes from a_buf into ymm5 for STOREARGB_AVX2.
// The loop always runs at least once and consumes 16 pixels per pass.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    const uint8_t* a_buf,
                                    uint8_t* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes a byte offset relative to u_buf.
    "sub %[u_buf],%[v_buf] \n"

    LABELALIGN
    "1: \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // Explicit 'l' suffix: on i386 width is a memory operand.
    "subl $0x10,%[width] \n"
    "jg 1b \n"
    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) // Step 3: Weave into RGBA "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" "vpermq $0xd8,%%ymm1,%%ymm1 \n" "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n" "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" "vmovdqu %%ymm0,(%[dst_argb]) \n" "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" "lea 0x40(%[dst_argb]),%[dst_argb] \n" "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TORGBAROW_AVX2 #if defined(HAS_NV12TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_NV12TOARGBROW_AVX2 #if defined(HAS_NV21TOARGBROW_AVX2) // 16 pixels. 
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, const uint8_t* vu_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [vu_buf]"+r"(vu_buf), // %[vu_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_NV21TOARGBROW_AVX2 #if defined(HAS_YUY2TOARGBROW_AVX2) // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_YUY2TOARGBROW_AVX2 #if defined(HAS_UYVYTOARGBROW_AVX2) // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). 
void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_UYVYTOARGBROW_AVX2 #if defined(HAS_P210TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_P210TOARGBROW_AVX2 #if defined(HAS_P410TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); // clang-format on } #endif // HAS_P410TOARGBROW_AVX2 #if defined(HAS_P210TOAR30ROW_AVX2) // 16 pixels // 16 UV values with 16 Y producing 16 AR30 (64 bytes). void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max "vpsrlw $6,%%ymm7,%%ymm7 \n" LABELALIGN "1: \n" READP210_AVX2 YUVTORGB16_AVX2(yuvconstants) STOREAR30_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [uv_buf]"+r"(uv_buf), // %[uv_buf] [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } #endif // HAS_P210TOAR30ROW_AVX2 #if defined(HAS_P410TOAR30ROW_AVX2) // 16 pixels // 16 UV values with 16 Y producing 16 AR30 (64 bytes). 
// Sets up the three constants STOREAR30_AVX2 relies on (ymm5, ymm6, ymm7)
// before entering the loop. The loop always runs at least once and consumes
// 16 pixels per pass.
void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf,
                               const uint16_t* uv_buf,
                               uint8_t* dst_ar30,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // AR30 constants
    "vpsrlw $14,%%ymm5,%%ymm5 \n"
    "vpsllw $4,%%ymm5,%%ymm5 \n"  // 2 alpha bits
    "vpxor %%ymm6,%%ymm6,%%ymm6 \n"  // 0 for min
    "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n"  // 1023 for max
    "vpsrlw $6,%%ymm7,%%ymm7 \n"

    LABELALIGN
    "1: \n"
    READP410_AVX2
    YUVTORGB16_AVX2(yuvconstants)
    STOREAR30_AVX2
    "sub $0x10,%[width] \n"
    "jg 1b \n"

    "vzeroupper \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_P410TOAR30ROW_AVX2

#ifdef HAS_I400TOARGBROW_SSE2
// Expands 8 Y bytes per pass into 8 grey pixels: the scaled Y value is
// replicated into the three low bytes of each 32-bit pixel and the top byte
// is forced to 0xff. Reads the scale and bias words at offsets 96 and 128 of
// yuvconstants (movdqa, so 16-byte alignment is required). The loop always
// runs at least once.
void I400ToARGBRow_SSE2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "movdqa 96(%3),%%xmm2 \n"  // yg = 18997 = 1.164
      // NOTE(review): the AVX2 variant below describes this same constant as
      // -1160; confirm the sign (it is added with paddsw).
      "movdqa 128(%3),%%xmm3 \n"  // ygb = 1160 = 1.164 * 16
      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0xff000000
      "pslld $0x18,%%xmm4 \n"

      LABELALIGN
      "1: \n"
      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
      "movq (%0),%%xmm0 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"
      "paddsw %%xmm3,%%xmm0 \n"
      "psraw $6, %%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"

      // Step 2: Weave into ARGB
      "punpcklbw %%xmm0,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklwd %%xmm0,%%xmm0 \n"
      "punpckhwd %%xmm1,%%xmm1 \n"
      "por %%xmm4,%%xmm0 \n"
      "por %%xmm4,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"

      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_SSE2

#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// Same computation as the SSE2 variant above, 16 pixels per pass; the
// constants are loaded with vmovdqa and therefore need 32-byte alignment.
void I400ToARGBRow_AVX2(const uint8_t* y_buf,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      "vmovdqa 96(%3),%%ymm2 \n"  // yg = 18997 = 1.164
      "vmovdqa 128(%3),%%ymm3 \n"  // ygb = -1160 = 1.164*16
      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 0xff000000
      "vpslld $0x18,%%ymm4,%%ymm4 \n"

      LABELALIGN
      "1: \n"
      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
      "vmovdqu (%0),%%xmm0 \n"
      "lea 0x10(%0),%0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n"
      "vpsraw $0x6,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
      "vpor %%ymm4,%%ymm0,%%ymm0 \n"
      "vpor %%ymm4,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(y_buf),       // %0
        "+r"(dst_argb),    // %1
        "+rm"(width)       // %2
      : "r"(yuvconstants)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Writes the bytes of src to dst in reverse order, 16 bytes per pass. The
// source is read from its end (src + width - 16) moving backwards while dst
// advances forwards. The loop always runs at least once.
void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
  // Pointer-sized copy of width so it can be used as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu -0x10(%0,%2,1),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
// 32-byte-per-pass version of MirrorRow_SSSE3: bytes are reversed within each
// 128-bit half by vpshufb, then vpermq $0x4e swaps the two halves.
void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  // Pointer-sized copy of width so it can be used as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu -0x20(%0,%2,1),%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),           // %0
        "+r"(dst),           // %1
        "+r"(temp_width)     // %2
      : "m"(kShuffleMirror)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
                                       6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};

// Reverses the order of 2-byte pairs while keeping the byte order inside each
// pair, 8 pairs (16 bytes) per pass. width counts pairs: the source index is
// scaled by 2 and the counter drops by 8 per pass.
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  // Pointer-sized copy of width so it can be used as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "movdqa %3,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu -0x10(%0,%2,2),%%xmm0 \n"
      "pshufb %%xmm5,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_uv),          // %1
        "+r"(temp_width)       // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_MIRRORUVROW_AVX2
// 16-pair-per-pass version of MirrorUVRow_SSSE3: pairs are reversed within
// each 128-bit half by vpshufb, then vpermq $0x4e swaps the two halves.
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  // Pointer-sized copy of width so it can be used as an index register.
  intptr_t temp_width = (intptr_t)(width);
  asm volatile(

      "vbroadcastf128 %3,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),          // %0
        "+r"(dst_uv),          // %1
        "+r"(temp_width)       // %2
      : "m"(kShuffleMirrorUV)  // %3
      : "memory", "cc", "xmm0", "xmm5");
}
#endif  // HAS_MIRRORUVROW_AVX2

#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; void MirrorSplitUVRow_SSSE3(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( "movdqa %4,%%xmm1 \n" "lea -0x10(%0,%3,2),%0 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea -0x10(%0),%0 \n" "pshufb %%xmm1,%%xmm0 \n" "movlpd %%xmm0,(%1) \n" "movhpd %%xmm0,0x00(%1,%2,1) \n" "lea 0x8(%1),%1 \n" "sub $8,%3 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(temp_width) // %3 : "m"(kShuffleMirrorSplitUV) // %4 : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_MIRRORSPLITUVROW_SSSE3 #ifdef HAS_RGB24MIRRORROW_SSSE3 // Shuffle first 5 pixels to last 5 mirrored. first byte zero static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; // Shuffle last 5 pixels to first 5 mirrored. last byte zero static const uvec8 kShuffleMirrorRGB1 = { 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; // Shuffle 5 pixels at a time (15 bytes) void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // first 5 "movdqu 15(%0),%%xmm1 \n" // next 5 "movdqu 30(%0),%%xmm2 \n" // next 5 "movdqu 32(%0),%%xmm3 \n" // last 1 special "pshufb %%xmm4,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" "pshufb %%xmm4,%%xmm2 \n" "pshufb %%xmm5,%%xmm3 \n" "lea -0x30(%0),%0 \n" "movdqu %%xmm0,32(%1) \n" // last 5 "movdqu %%xmm1,17(%1) \n" // next 5 "movdqu %%xmm2,2(%1) \n" // next 5 "movlpd %%xmm3,0(%1) \n" // first 1 "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_rgb24), // %1 "+r"(temp_width) // %2 : "m"(kShuffleMirrorRGB0), // %3 "m"(kShuffleMirrorRGB1) // %4 : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_RGB24MIRRORROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( "lea -0x10(%0,%2,4),%0 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "pshufd $0x1b,%%xmm0,%%xmm0 \n" "lea -0x10(%0),%0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); asm volatile( "vmovdqu %3,%%ymm5 \n" LABELALIGN "1: \n" "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 : "m"(kARGBShuffleMirror_AVX2) // %3 : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrlw $0x8,%%ymm5,%%ymm5 \n" "sub %1,%2 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" "vpsrlw $0x8,%%ymm0,%%ymm2 \n" "vpsrlw $0x8,%%ymm1,%%ymm3 \n" "vpand %%ymm5,%%ymm0,%%ymm0 \n" "vpand %%ymm5,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm2,0x00(%1,%2,1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", 
"xmm5");
}
#endif  // HAS_SPLITUVROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
// Deinterleaves a row of UV byte pairs into separate U and V rows.
// Each loop pass reads 32 bytes from src_uv, writes 16 bytes to dst_u and
// 16 bytes to dst_v, and subtracts 16 from width. The loop body always runs
// at least once and repeats while width stays positive.
void SplitUVRow_SSE2(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff in every 16 bit lane
      "sub %1,%2 \n"  // %2 = dst_v - dst_u so (%1,%2,1) addresses dst_v

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "pand %%xmm5,%%xmm0 \n"  // keep even bytes (U)
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "psrlw $0x8,%%xmm2 \n"  // move odd bytes (V) down
      "psrlw $0x8,%%xmm3 \n"
      "packuswb %%xmm3,%%xmm2 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm2,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
// Interleaves a row of U bytes and a row of V bytes into UV pairs.
// Each loop pass reads 32 bytes from src_u and src_v, writes 64 bytes to
// dst_uv, and subtracts 32 from width.
void MergeUVRow_AVX2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "sub %0,%1 \n"  // %1 = src_v - src_u so (%0,%1,1) addresses src_v

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
      "lea 0x20(%0),%0 \n"
      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
      // The unpacks work per 128 bit lane, so the four halves are stored
      // individually to restore pixel order.
      "vextractf128 $0x0,%%ymm2,(%2) \n"
      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
      "lea 0x40(%2),%2 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
// Interleaves a row of U bytes and a row of V bytes into UV pairs.
// Each loop pass reads 16 bytes from src_u and src_v, writes 32 bytes to
// dst_uv, and subtracts 16 from width.
void MergeUVRow_SSE2(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "sub %0,%1 \n"  // %1 = src_v - src_u so (%0,%1,1) addresses src_v

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x00(%0,%1,1),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm2 \n"
      "movdqu %%xmm0,(%2) \n"
      "movdqu %%xmm2,0x10(%2) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_MERGEUVROW_16_AVX2
// Interleaves rows of 16 bit U and V samples into UV pairs, shifting every
// sample left by (16 - depth) bits on the way.
void MergeUVRow_16_AVX2(const uint16_t* src_u,
                        const uint16_t* src_v,
                        uint16_t* dst_uv,
                        int depth,
                        int width) {
  depth = 16 - depth;  // left shift count applied to every sample
  // clang-format off
  asm volatile (
      "vmovd %4,%%xmm3 \n"
      "sub %0,%1 \n"  // %1 = src_v - src_u

      // 16 pixels per loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu (%0,%1,1),%%ymm1 \n"
      "add $0x20,%0 \n"

      "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
      "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
      "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"  // mutates
      "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm2,(%2) \n"
      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
      "add $0x40,%2 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_u),   // %0
        "+r"(src_v),   // %1
        "+r"(dst_uv),  // %2
        "+r"(width)    // %3
      : "r"(depth)     // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
  // clang-format on
}
#endif  // HAS_MERGEUVROW_16_AVX2

#ifdef HAS_SPLITUVROW_16_AVX2
// Byte shuffle that gathers the even 16 bit words of a 128 bit lane into its
// low half and the odd words into its high half.
const uvec8 kSplitUVShuffle16 = {0, 1, 4,  5,  8,  9,  12, 13,
                                 2, 3, 6,  7,  10, 11, 14, 15};

// Deinterleaves a row of 16 bit UV pairs into separate U and V rows,
// shifting every sample right by (16 - depth) bits on the way.
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
                        uint16_t* dst_u,
                        uint16_t* dst_v,
                        int depth,
                        int width) {
  depth = 16 - depth;  // right shift count applied to every sample
  // clang-format off
  asm volatile (
      "vmovd %4,%%xmm3 \n"
      "vbroadcastf128 %5,%%ymm4 \n"
      "sub %1,%2 \n"  // %2 = dst_v - dst_u

      // 16 pixels per loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "add $0x40,%0 \n"

      "vpsrlw %%xmm3,%%ymm0,%%ymm0 \n"
      "vpsrlw %%xmm3,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vextractf128 $0x0,%%ymm0,(%1) \n"
      "vextractf128 $0x0,%%ymm1,0x10(%1) \n"
      "vextractf128 $0x1,%%ymm0,(%1,%2) \n"
      "vextractf128 $0x1,%%ymm1,0x10(%1,%2) \n"
      "add $0x20,%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uv),           // %0
        "+r"(dst_u),            // %1
        "+r"(dst_v),            // %2
        "+r"(width)             // %3
      : "r"(depth),             // %4
        "m"(kSplitUVShuffle16)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
  // clang-format on
}
#endif  // HAS_SPLITUVROW_16_AVX2

// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
// 64 = 10 bits
// 16 = 12 bits
// 1 = 16 bits
#ifdef HAS_MULTIPLYROW_16_AVX2
// Multiplies every 16 bit sample by the low 16 bits of scale and keeps the
// low 16 bits of each product (vpmullw).
void MultiplyRow_16_AVX2(const uint16_t* src_y,
                         uint16_t* dst_y,
                         int scale,
                         int width) {
  // clang-format off
  asm volatile (
      "vmovd %3,%%xmm3 \n"
      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
      "vbroadcastss %%xmm3,%%ymm3 \n"  // low word of scale in every lane
      "sub %0,%1 \n"  // %1 = dst_y - src_y so (%0,%1) addresses dst_y

      // 32 pixels per loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpmullw %%ymm3,%%ymm0,%%ymm0 \n"
      "vpmullw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%0,%1) \n"
      "vmovdqu %%ymm1,0x20(%0,%1) \n"
      "add $0x40,%0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
#endif  // HAS_MULTIPLYROW_16_AVX2

// Use scale to convert msb formats to lsb, depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// 65536 = 16 bits
// NOTE(review): only the low 16 bits of scale are broadcast below, so a scale
// of 65536 would multiply by 0 - confirm whether that entry is reachable.
#ifdef HAS_DIVIDEROW_16_AVX2
// Multiplies every 16 bit sample by the low 16 bits of scale and keeps the
// high 16 bits of each unsigned product (vpmulhuw).
void DivideRow_16_AVX2(const uint16_t* src_y,
                       uint16_t* dst_y,
                       int scale,
                       int width) {
  // clang-format off
  asm volatile (
      "vmovd %3,%%xmm3 \n"
      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
      "vbroadcastss %%xmm3,%%ymm3 \n"  // low word of scale in every lane
      "sub %0,%1 \n"  // %1 = dst_y - src_y so (%0,%1) addresses dst_y

      // 32 pixels per loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpmulhuw %%ymm3,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%0,%1) \n"
      "vmovdqu %%ymm1,0x20(%0,%1) \n"
      "add $0x40,%0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width),  // %2
        "+r"(scale)   // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm3");
  // clang-format on
}
#endif  // HAS_DIVIDEROW_16_AVX2

// Use scale to convert lsb 16 bit formats to 8 bit, depending how many bits
// there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// Each sample becomes (sample * scale) >> 16 and is then packed to a byte
// with unsigned saturation.
void Convert16To8Row_SSSE3(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width) {
  // clang-format off
  asm volatile (
      "movd %3,%%xmm2 \n"
      "punpcklwd %%xmm2,%%xmm2 \n"
      "pshufd $0x0,%%xmm2,%%xmm2 \n"  // low word of scale in every lane

      // 16 pixels per loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "add $0x20,%0 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"
      "pmulhuw %%xmm2,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "add $0x10,%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}

#ifdef HAS_CONVERT16TO8ROW_AVX2
// AVX2 version of Convert16To8Row: each sample becomes (sample * scale) >> 16
// and is packed to a byte with unsigned saturation.
void Convert16To8Row_AVX2(const uint16_t* src_y,
                          uint8_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      "vmovd %3,%%xmm2 \n"
      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
      "vbroadcastss %%xmm2,%%ymm2 \n"  // low word of scale in every lane

      // 32 pixels per loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "add $0x40,%0 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"  // mutates
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // restore order after per lane pack
      "vmovdqu %%ymm0,(%1) \n"
      "add $0x20,%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT16TO8ROW_AVX2

// Use scale to convert to lsb formats depending how many bits there are:
// 512 = 9 bits
// 1024 = 10 bits
// 4096 = 12 bits
// Each source byte b is widened to the 16 bit value b * 0x0101 and then
// becomes (value * scale) >> 16.
void Convert8To16Row_SSE2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      "movd %3,%%xmm2 \n"
      "punpcklwd %%xmm2,%%xmm2 \n"
      "pshufd $0x0,%%xmm2,%%xmm2 \n"  // low word of scale in every lane

      // 16 pixels per loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"  // duplicate each byte into a word
      "punpckhbw %%xmm1,%%xmm1 \n"
      "add $0x10,%0 \n"
      "pmulhuw %%xmm2,%%xmm0 \n"
      "pmulhuw %%xmm2,%%xmm1 \n"
      "movdqu %%xmm0,(%1) \n"
      "movdqu %%xmm1,0x10(%1) \n"
      "add $0x20,%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}

#ifdef HAS_CONVERT8TO16ROW_AVX2
// AVX2 version of Convert8To16Row: each source byte b is widened to
// b * 0x0101 and then becomes (value * scale) >> 16.
void Convert8To16Row_AVX2(const uint8_t* src_y,
                          uint16_t* dst_y,
                          int scale,
                          int width) {
  // clang-format off
  asm volatile (
      "vmovd %3,%%xmm2 \n"
      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
      "vbroadcastss %%xmm2,%%ymm2 \n"  // low word of scale in every lane

      // 32 pixels per loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // pre-permute for per lane unpacks
      "add $0x20,%0 \n"
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "add $0x40,%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),  // %0
        "+r"(dst_y),  // %1
        "+r"(width)   // %2
      : "r"(scale)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
  // clang-format on
}
#endif  // HAS_CONVERT8TO16ROW_AVX2

#ifdef HAS_SPLITRGBROW_SSSE3

// Shuffle table for converting RGB to Planar.
static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u}; static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u}; static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u, 128u, 128u, 128u, 128u, 128u}; static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 2u, 5u, 8u, 11u, 14u}; static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, 4u, 7u, 10u, 13u, 128u, 128u, 128u, 128u, 128u, 128u}; static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 0u, 3u, 6u, 9u, 12u, 15u}; void SplitRGBRow_SSSE3(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "pshufb %5, %%xmm0 \n" "pshufb %6, %%xmm1 \n" "pshufb %7, %%xmm2 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "pshufb %8, %%xmm0 \n" "pshufb %9, %%xmm1 \n" "pshufb %10, %%xmm2 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "pshufb %11, %%xmm0 \n" "pshufb %12, %%xmm1 \n" "pshufb %13, %%xmm2 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" 
"movdqu %%xmm0,(%3) \n" "lea 0x10(%3),%3 \n" "lea 0x30(%0),%0 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(width) // %4 : "m"(kShuffleMaskRGBToR0), // %5 "m"(kShuffleMaskRGBToR1), // %6 "m"(kShuffleMaskRGBToR2), // %7 "m"(kShuffleMaskRGBToG0), // %8 "m"(kShuffleMaskRGBToG1), // %9 "m"(kShuffleMaskRGBToG2), // %10 "m"(kShuffleMaskRGBToB0), // %11 "m"(kShuffleMaskRGBToB1), // %12 "m"(kShuffleMaskRGBToB2) // %13 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_SPLITRGBROW_SSSE3 #ifdef HAS_MERGERGBROW_SSSE3 // Shuffle table for converting RGB to Planar. static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u, 128u, 5u}; static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u, 128u}; static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, 128u, 128u, 2u, 128u, 128u, 3u, 128u, 128u, 4u, 128u}; static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u, 10u}; static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u}; static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, 128u, 128u, 8u, 128u, 128u, 9u, 128u, 128u, 10u, 128u}; static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, 15u}; static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, 15u, 128u, 128u}; static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, 128u, 128u, 13u, 128u, 128u, 14u, 128u, 128u, 15u, 128u}; void MergeRGBRow_SSSE3(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width) { asm volatile( LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 
(%1),%%xmm1 \n" "movdqu (%2),%%xmm2 \n" "pshufb %5, %%xmm0 \n" "pshufb %6, %%xmm1 \n" "pshufb %7, %%xmm2 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%3) \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" "movdqu (%2),%%xmm2 \n" "pshufb %8, %%xmm0 \n" "pshufb %9, %%xmm1 \n" "pshufb %10, %%xmm2 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "movdqu %%xmm0,16(%3) \n" "movdqu (%0),%%xmm0 \n" "movdqu (%1),%%xmm1 \n" "movdqu (%2),%%xmm2 \n" "pshufb %11, %%xmm0 \n" "pshufb %12, %%xmm1 \n" "pshufb %13, %%xmm2 \n" "por %%xmm1,%%xmm0 \n" "por %%xmm2,%%xmm0 \n" "movdqu %%xmm0,32(%3) \n" "lea 0x10(%0),%0 \n" "lea 0x10(%1),%1 \n" "lea 0x10(%2),%2 \n" "lea 0x30(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_rgb), // %3 "+r"(width) // %4 : "m"(kShuffleMaskRToRGB0), // %5 "m"(kShuffleMaskGToRGB0), // %6 "m"(kShuffleMaskBToRGB0), // %7 "m"(kShuffleMaskRToRGB1), // %8 "m"(kShuffleMaskGToRGB1), // %9 "m"(kShuffleMaskBToRGB1), // %10 "m"(kShuffleMaskRToRGB2), // %11 "m"(kShuffleMaskGToRGB2), // %12 "m"(kShuffleMaskBToRGB2) // %13 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGERGBROW_SSSE3 #ifdef HAS_MERGEARGBROW_SSE2 void MergeARGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width) { asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" LABELALIGN "1: \n" "movq (%0,%2),%%xmm0 \n" // B "movq (%0),%%xmm1 \n" // R "movq (%0,%1),%%xmm2 \n" // G "punpcklbw %%xmm1,%%xmm0 \n" // BR "movq (%0,%3),%%xmm1 \n" // A "punpcklbw %%xmm1,%%xmm2 \n" // GA "movdqa %%xmm0,%%xmm1 \n" // BR "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) "movdqu %%xmm0,(%4) \n" "movdqu %%xmm1,16(%4) \n" "lea 8(%0),%0 \n" "lea 32(%4),%4 \n" "sub $0x8,%5 \n" "jg 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_argb), // %4 "+r"(width) // %5 : : "memory", "cc", 
"xmm0", "xmm1", "xmm2"); } #endif #ifdef HAS_MERGEXRGBROW_SSE2 void MergeXRGBRow_SSE2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) { asm volatile( LABELALIGN "1: \n" "movq (%2),%%xmm0 \n" // B "movq (%0),%%xmm1 \n" // R "movq (%1),%%xmm2 \n" // G "punpcklbw %%xmm1,%%xmm0 \n" // BR "pcmpeqd %%xmm1,%%xmm1 \n" // A(255) "punpcklbw %%xmm1,%%xmm2 \n" // GA "movdqa %%xmm0,%%xmm1 \n" // BR "punpckhbw %%xmm2,%%xmm1 \n" // BGRA (hi) "punpcklbw %%xmm2,%%xmm0 \n" // BGRA (lo) "movdqu %%xmm0,(%3) \n" "movdqu %%xmm1,16(%3) \n" "lea 8(%0),%0 \n" "lea 8(%1),%1 \n" "lea 8(%2),%2 \n" "lea 32(%3),%3 \n" "sub $0x8,%4 \n" "jg 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEARGBROW_SSE2 #ifdef HAS_MERGEARGBROW_AVX2 void MergeARGBRow_AVX2(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width) { asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" LABELALIGN "1: \n" "vmovdqu (%0,%2),%%xmm0 \n" // B "vmovdqu (%0,%1),%%xmm1 \n" // R "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n" // A "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%4) \n" // First 8 "vmovdqu %%ymm1,32(%4) \n" // Next 8 "lea 16(%0),%0 \n" "lea 64(%4),%4 \n" "sub $0x10,%5 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_argb), // %4 "+r"(width) // %5 : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif #ifdef HAS_MERGEXRGBROW_AVX2 void MergeXRGBRow_AVX2(const uint8_t* src_r, const uint8_t* 
src_g, const uint8_t* src_b, uint8_t* dst_argb, int width) { asm volatile( LABELALIGN "1: \n" "vmovdqu (%2),%%xmm0 \n" // B "vpcmpeqd %%ymm1,%%ymm1,%%ymm1 \n" // A(255) "vinserti128 $0,(%1),%%ymm1,%%ymm1 \n" // R "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n" // G "vpunpckhbw %%ymm1,%%ymm0,%%ymm2 \n" "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" "vperm2i128 $0x31,%%ymm2,%%ymm0,%%ymm1 \n" "vperm2i128 $0x20,%%ymm2,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%3) \n" // First 8 "vmovdqu %%ymm1,32(%3) \n" // Next 8 "lea 16(%0),%0 \n" "lea 16(%1),%1 \n" "lea 16(%2),%2 \n" "lea 64(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_argb), // %3 "+rm"(width) // %4 : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEARGBROW_AVX2 #ifdef HAS_SPLITARGBROW_SSE2 void SplitARGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) { asm volatile( "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F "movdqa %%xmm0,%%xmm2 \n" "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) "movdqa %%xmm0,%%xmm2 \n" "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) "movdqa %%xmm0,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) "movlps %%xmm0,(%1,%3) \n" // B "movhps %%xmm0,(%1,%2) \n" // G "movlps %%xmm2,(%1) \n" // R "movhps 
%%xmm2,(%1,%4) \n" // A "lea 32(%0),%0 \n" "lea 8(%1),%1 \n" "sub $0x8,%5 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(dst_a), // %4 "+rm"(width) // %5 : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif #ifdef HAS_SPLITXRGBROW_SSE2 void SplitXRGBRow_SSE2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F "movdqa %%xmm0,%%xmm2 \n" "punpcklqdq %%xmm1,%%xmm0 \n" // 00-07 10-17 "punpckhqdq %%xmm1,%%xmm2 \n" // 08-0F 18-1F "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" // 08192A3B4C5D6E7F (lo) "punpckhbw %%xmm2,%%xmm1 \n" // 08192A3B4C5D6E7F (hi) "movdqa %%xmm0,%%xmm2 \n" "punpcklqdq %%xmm1,%%xmm0 \n" // 08192A3B08192A3B "punpckhqdq %%xmm1,%%xmm2 \n" // 4C5D6E7F4C5D6E7F "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" // 048C159D26AE37BF (lo) "punpckhbw %%xmm2,%%xmm1 \n" // 048C159D26AE37BF (hi) "movdqa %%xmm0,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) "movlps %%xmm0,(%3) \n" // B "movhps %%xmm0,(%2) \n" // G "movlps %%xmm2,(%1) \n" // R "lea 32(%0),%0 \n" "lea 8(%1),%1 \n" "lea 8(%2),%2 \n" "lea 8(%3),%3 \n" "sub $0x8,%4 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+rm"(width) // %4 : : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; #ifdef HAS_SPLITARGBROW_SSSE3 void SplitARGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) { asm volatile( "movdqa %6,%%xmm3 \n" "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) "pshufb %%xmm3,%%xmm1 \n" // 
048C159D26AE37BF (hi) "movdqa %%xmm0,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) "movlps %%xmm0,(%1,%3) \n" // B "movhps %%xmm0,(%1,%2) \n" // G "movlps %%xmm2,(%1) \n" // R "movhps %%xmm2,(%1,%4) \n" // A "lea 32(%0),%0 \n" "lea 8(%1),%1 \n" "subl $0x8,%5 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(dst_a), // %4 #if defined(__i386__) "+m"(width) // %5 #else "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } #endif #ifdef HAS_SPLITXRGBROW_SSSE3 void SplitXRGBRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( "movdqa %5,%%xmm3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 00-0F "movdqu 16(%0),%%xmm1 \n" // 10-1F "pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo) "pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi) "movdqa %%xmm0,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG) "punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA) "movlps %%xmm0,(%3) \n" // B "movhps %%xmm0,(%2) \n" // G "movlps %%xmm2,(%1) \n" // R "lea 32(%0),%0 \n" "lea 8(%1),%1 \n" "lea 8(%2),%2 \n" "lea 8(%3),%3 \n" "sub $0x8,%4 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(width) // %4 : "m"(kShuffleMaskARGBSplit) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } #endif #ifdef HAS_SPLITARGBROW_AVX2 static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7}; void SplitARGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) { asm volatile( "sub %1,%2 \n" "sub %1,%3 \n" "sub %1,%4 \n" "vmovdqa %7,%%ymm3 \n" "vbroadcastf128 %6,%%ymm4 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00-0F "vmovdqu 16(%0),%%xmm1 \n" // 10-1F "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F "vinserti128 
$1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" "vpermd %%ymm0,%%ymm3,%%ymm0 \n" "vpermd %%ymm1,%%ymm3,%%ymm1 \n" "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR "vmovdqu %%xmm0,(%1,%3) \n" // B "vextracti128 $1,%%ymm0,(%1) \n" // R "vmovdqu %%xmm2,(%1,%2) \n" // G "vextracti128 $1,%%ymm2,(%1,%4) \n" // A "lea 64(%0),%0 \n" "lea 16(%1),%1 \n" "subl $0x10,%5 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(dst_a), // %4 #if defined(__i386__) "+m"(width) // %5 #else "+rm"(width) // %5 #endif : "m"(kShuffleMaskARGBSplit), // %6 "m"(kShuffleMaskARGBPermute) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif #ifdef HAS_SPLITXRGBROW_AVX2 void SplitXRGBRow_AVX2(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( "vmovdqa %6,%%ymm3 \n" "vbroadcastf128 %5,%%ymm4 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00-0F "vmovdqu 16(%0),%%xmm1 \n" // 10-1F "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" "vpermd %%ymm0,%%ymm3,%%ymm0 \n" "vpermd %%ymm1,%%ymm3,%%ymm1 \n" "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" // BR "vmovdqu %%xmm0,(%3) \n" // B "vextracti128 $1,%%ymm0,(%1) \n" // R "vmovdqu %%xmm2,(%2) \n" // G "lea 64(%0),%0 \n" "lea 16(%1),%1 \n" "lea 16(%2),%2 \n" "lea 16(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(width) // %4 : "m"(kShuffleMaskARGBSplit), // %5 "m"(kShuffleMaskARGBPermute) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif #ifdef HAS_MERGEXR30ROW_AVX2 void MergeXR30Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* 
src_b, uint8_t* dst_ar30, int depth, int width) { int shift = depth - 10; asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants "vpsrlw $14,%%ymm5,%%ymm5 \n" "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $6,%%ymm6,%%ymm6 \n" "vmovd %5,%%xmm4 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu (%0,%1),%%ymm1 \n" "vmovdqu (%0,%2),%%ymm2 \n" "vpsrlw %%xmm4,%%ymm0,%%ymm0 \n" "vpsrlw %%xmm4,%%ymm1,%%ymm1 \n" "vpsrlw %%xmm4,%%ymm2,%%ymm2 \n" "vpminuw %%ymm0,%%ymm6,%%ymm0 \n" "vpminuw %%ymm1,%%ymm6,%%ymm1 \n" "vpminuw %%ymm2,%%ymm6,%%ymm2 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm1,%%ymm1 \n" "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpsllw $0x4,%%ymm0,%%ymm0 \n" // Shift R to target bit "vpunpckhwd %%ymm0,%%ymm2,%%ymm3 \n" // RB "vpunpcklwd %%ymm0,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" // AG "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" "vpslld $0xa,%%ymm1,%%ymm1 \n" // Shift AG to target bit "vpslld $0xa,%%ymm2,%%ymm2 \n" "vpor %%ymm1,%%ymm0,%%ymm0 \n" // Combine "vpor %%ymm2,%%ymm3,%%ymm3 \n" "vmovdqu %%ymm0,(%3) \n" "vmovdqu %%ymm3,0x20(%3) \n" "lea 0x20(%0),%0 \n" "lea 0x40(%3),%3 \n" "sub $0x10,%4 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_ar30), // %3 "+r"(width) // %4 #if defined(__i386__) : "m"(shift) // %5 #else : "rm"(shift) // %5 #endif : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif #ifdef HAS_MERGEAR64ROW_AVX2 static const lvec32 MergeAR64Permute = {0, 4, 2, 6, 1, 5, 3, 7}; void MergeAR64Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vmovdqa %8,%%ymm5 \n" "vmovd %6,%%xmm6 \n" "vbroadcastss %7,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 
\n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B "vmovdqu (%0,%3),%%ymm3 \n" // A "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" "vpminuw %%ymm3,%%ymm7,%%ymm3 \n" "vpsllw %%xmm6,%%ymm0,%%ymm0 \n" "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" "vpsllw %%xmm6,%%ymm3,%%ymm3 \n" "vpermd %%ymm0,%%ymm5,%%ymm0 \n" "vpermd %%ymm1,%%ymm5,%%ymm1 \n" "vpermd %%ymm2,%%ymm5,%%ymm2 \n" "vpermd %%ymm3,%%ymm5,%%ymm3 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) "vmovdqu %%ymm3,(%4) \n" "vmovdqu %%ymm2,0x20(%4) \n" "vmovdqu %%ymm4,0x40(%4) \n" "vmovdqu %%ymm1,0x60(%4) \n" "lea 0x20(%0),%0 \n" "lea 0x80(%4),%4 \n" "subl $0x10,%5 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_ar64), // %4 #if defined(__i386__) "+m"(width) // %5 #else "+rm"(width) // %5 #endif : "m"(shift), // %6 "m"(mask), // %7 "m"(MergeAR64Permute) // %8 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_MERGEXR64ROW_AVX2 void MergeXR64Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; mask = (mask << 16) + mask; asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vmovdqa %7,%%ymm5 \n" "vmovd %5,%%xmm6 \n" "vbroadcastss %6,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B "vpminuw %%ymm0,%%ymm7,%%ymm0 \n" "vpminuw %%ymm1,%%ymm7,%%ymm1 \n" "vpminuw %%ymm2,%%ymm7,%%ymm2 \n" 
"vpsllw %%xmm6,%%ymm0,%%ymm0 \n" "vpsllw %%xmm6,%%ymm1,%%ymm1 \n" "vpsllw %%xmm6,%%ymm2,%%ymm2 \n" "vpermd %%ymm0,%%ymm5,%%ymm0 \n" "vpermd %%ymm1,%%ymm5,%%ymm1 \n" "vpermd %%ymm2,%%ymm5,%%ymm2 \n" "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" // A (0xffff) "vpunpcklwd %%ymm1,%%ymm2,%%ymm4 \n" // BG(low) "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" // BG(hi) "vpunpcklwd %%ymm3,%%ymm0,%%ymm2 \n" // RA(low) "vpunpckhwd %%ymm3,%%ymm0,%%ymm0 \n" // RA(hi) "vpunpckldq %%ymm2,%%ymm4,%%ymm3 \n" // BGRA(1) "vpunpckhdq %%ymm2,%%ymm4,%%ymm4 \n" // BGRA(3) "vpunpckldq %%ymm0,%%ymm1,%%ymm2 \n" // BGRA(2) "vpunpckhdq %%ymm0,%%ymm1,%%ymm1 \n" // BGRA(4) "vmovdqu %%ymm3,(%3) \n" "vmovdqu %%ymm2,0x20(%3) \n" "vmovdqu %%ymm4,0x40(%3) \n" "vmovdqu %%ymm1,0x60(%3) \n" "lea 0x20(%0),%0 \n" "lea 0x80(%3),%3 \n" "subl $0x10,%4 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_ar64), // %3 "+r"(width) // %4 : "m"(shift), // %5 "m"(mask), // %6 "m"(MergeAR64Permute) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_MERGEARGB16TO8ROW_AVX2 static const uvec8 MergeARGB16To8Shuffle = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; void MergeARGB16To8Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width) { int shift = depth - 8; asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "vbroadcastf128 %7,%%ymm5 \n" "vmovd %6,%%xmm6 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B "vmovdqu (%0,%3),%%ymm3 \n" // A "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" "vpsrlw %%xmm6,%%ymm3,%%ymm3 \n" "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA 
(interleave) "vpermq $0xd8,%%ymm1,%%ymm1 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) "vmovdqu %%ymm2,(%4) \n" "vmovdqu %%ymm0,0x20(%4) \n" "lea 0x20(%0),%0 \n" "lea 0x40(%4),%4 \n" "subl $0x10,%5 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_argb), // %4 #if defined(__i386__) "+m"(width) // %5 #else "+rm"(width) // %5 #endif : "m"(shift), // %6 "m"(MergeARGB16To8Shuffle) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_MERGEXRGB16TO8ROW_AVX2 void MergeXRGB16To8Row_AVX2(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width) { int shift = depth - 8; asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "vbroadcastf128 %6,%%ymm5 \n" "vmovd %5,%%xmm6 \n" "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" "vpsrlw $8,%%ymm3,%%ymm3 \n" // A (0xff) LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" // R "vmovdqu (%0,%1),%%ymm1 \n" // G "vmovdqu (%0,%2),%%ymm2 \n" // B "vpsrlw %%xmm6,%%ymm0,%%ymm0 \n" "vpsrlw %%xmm6,%%ymm1,%%ymm1 \n" "vpsrlw %%xmm6,%%ymm2,%%ymm2 \n" "vpackuswb %%ymm1,%%ymm2,%%ymm1 \n" // BG (planar) "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" // RA (planar) "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" // BG (interleave) "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // RA (interleave) "vpermq $0xd8,%%ymm1,%%ymm1 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpcklwd %%ymm0,%%ymm1,%%ymm2 \n" // BGRA (low) "vpunpckhwd %%ymm0,%%ymm1,%%ymm0 \n" // BGRA (hi) "vmovdqu %%ymm2,(%3) \n" "vmovdqu %%ymm0,0x20(%3) \n" "lea 0x20(%0),%0 \n" "lea 0x40(%3),%3 \n" "subl $0x10,%4 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 : "m"(shift), // %5 "m"(MergeARGB16To8Shuffle) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_COPYROW_SSE2 void 
CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "test $0xf,%0 \n" "jne 2f \n" "test $0xf,%1 \n" "jne 2f \n" LABELALIGN "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "movdqa %%xmm0,(%1) \n" "movdqa %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "jmp 9f \n" LABELALIGN "2: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 2b \n" LABELALIGN "9: \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { asm volatile( LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x40,%2 \n" "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX #ifdef HAS_COPYROW_ERMS // Multiple of 1. 
// Copies `width` bytes from src to dst with a single `rep movsb`.
// The "+S" / "+D" / "+c" constraints bind src, dst and the byte count to the
// source-index, destination-index and count registers that the string
// instruction reads and updates implicitly.
void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
  size_t width_tmp = (size_t)(width);  // Count operand is passed as size_t.
  asm volatile(
      "rep movsb \n"
      : "+S"(src),       // %0
        "+D"(dst),       // %1
        "+c"(width_tmp)  // %2
      :
      : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Replaces the top byte of every 32-bit pixel in dst with the top byte of the
// matching pixel in src; the low three bytes of dst are kept.
// Each iteration handles 8 pixels (32 bytes). The loop is do/while shaped
// (`sub` then `jg`), so it always runs at least once and only stops once the
// remaining width is <= 0.
void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm0,%%xmm0 \n"
      "pslld $0x18,%%xmm0 \n"  // xmm0 = 0xff000000 per pixel (top byte mask)
      "pcmpeqb %%xmm1,%%xmm1 \n"
      "psrld $0x8,%%xmm1 \n"  // xmm1 = 0x00ffffff per pixel (low 3 bytes mask)

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu 0x10(%0),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "movdqu (%1),%%xmm4 \n"
      "movdqu 0x10(%1),%%xmm5 \n"
      "pand %%xmm0,%%xmm2 \n"  // keep top byte of src
      "pand %%xmm0,%%xmm3 \n"
      "pand %%xmm1,%%xmm4 \n"  // keep low 3 bytes of dst
      "pand %%xmm1,%%xmm5 \n"
      "por %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 form of the alpha copy: 16 pixels (64 bytes) per iteration.
// ymm0 is a byte mask of 0x00ffffff per pixel; vpblendvb takes the bytes whose
// mask byte has its high bit set from dst memory (low three bytes) and the
// remaining byte (the top one) from the src register.
void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"  // ymm0 = 0x00ffffff per pixel

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm1 \n"
      "vmovdqu 0x20(%0),%%ymm2 \n"
      "lea 0x40(%0),%0 \n"
      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "vmovdqu %%ymm2,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Writes the top byte of each 32-bit source pixel to dst_a, one byte per
// pixel. Each iteration reads 8 pixels (32 bytes) and stores 8 bytes.
void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0), %%xmm0 \n"
      "movdqu 0x10(%0), %%xmm1 \n"
      "lea 0x20(%0), %0 \n"
      "psrld $0x18, %%xmm0 \n"  // top byte moved down to bits 0..7
      "psrld $0x18, %%xmm1 \n"
      "packssdw %%xmm1, %%xmm0 \n"  // 8 dwords -> 8 words
      "packuswb %%xmm0, %%xmm0 \n"  // 8 words -> 8 bytes (low qword)
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1), %1 \n"
      "sub $0x8, %2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+rm"(width)     // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// Shuffle control that moves byte 3 of every 32-bit pixel to byte 0 and zeroes
// the other three bytes (selector 128 has the high bit set, which makes
// pshufb write zero).
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};

// AVX2 form of the alpha extract: reads 32 pixels (128 bytes) and stores 32
// bytes per iteration.
// The pack instructions work within each 128-bit lane, which leaves the
// output dwords out of order ("mutates"); the final vpermd with the index
// table kPermdARGBToY_AVX (defined elsewhere in this file) restores the
// order. vmovdqa is an aligned load, so that table must be 32-byte aligned.
void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "vmovdqa %3,%%ymm4 \n"
      "vbroadcastf128 %4,%%ymm5 \n"

      LABELALIGN
      "1: \n"
      "vmovdqu (%0), %%ymm0 \n"
      "vmovdqu 0x20(%0), %%ymm1 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"  // vpsrld $0x18, %%ymm0
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu 0x40(%0), %%ymm2 \n"
      "vmovdqu 0x60(%0), %%ymm3 \n"
      "lea 0x80(%0), %0 \n"
      "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n"  // mutates
      "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
      "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // mutates
      "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n"    // mutates.
      "vpermd %%ymm0,%%ymm4,%%ymm0 \n"       // unmutate.
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20, %2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),               // %0
        "+r"(dst_a),                  // %1
        "+rm"(width)                  // %2
      : "m"(kPermdARGBToY_AVX),       // %3
        "m"(kShuffleAlphaShort_AVX2)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Writes one source byte into the top byte of each 32-bit dst pixel, keeping
// the low three bytes of dst. Each iteration reads 8 source bytes and updates
// 8 pixels (32 bytes) of dst.
void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "pcmpeqb %%xmm0,%%xmm0 \n"
      "pslld $0x18,%%xmm0 \n"  // xmm0 = 0xff000000 per pixel
      "pcmpeqb %%xmm1,%%xmm1 \n"
      "psrld $0x8,%%xmm1 \n"  // xmm1 = 0x00ffffff per pixel

      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm2 \n"
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm2,%%xmm2 \n"  // each source byte doubled into a word
      // xmm3 is not initialised before this unpack: its old contents end up
      // in the low word of each dword and are removed by the 0xff000000 mask
      // below; only the high word (taken from xmm2) survives.
      "punpckhwd %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm2,%%xmm2 \n"
      "movdqu (%1),%%xmm4 \n"
      "movdqu 0x10(%1),%%xmm5 \n"
      "pand %%xmm0,%%xmm2 \n"
      "pand %%xmm0,%%xmm3 \n"
      "pand %%xmm1,%%xmm4 \n"
      "pand %%xmm1,%%xmm5 \n"
      "por %%xmm4,%%xmm2 \n"
      "por %%xmm5,%%xmm3 \n"
      "movdqu %%xmm2,(%1) \n"
      "movdqu %%xmm3,0x10(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 form: 16 source bytes update 16 dst pixels (64 bytes) per iteration.
// Each source byte is zero-extended to a dword, shifted into the top byte,
// then blended with the low three bytes read from dst.
void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpsrld $0x8,%%ymm0,%%ymm0 \n"  // ymm0 = 0x00ffffff per pixel

      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm1 \n"
      "vpmovzxbd 0x8(%0),%%ymm2 \n"
      "lea 0x10(%0),%0 \n"
      "vpslld $0x18,%%ymm1,%%ymm1 \n"
      "vpslld $0x18,%%ymm2,%%ymm2 \n"
      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n"
      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n"
      "vmovdqu %%ymm1,(%1) \n"
      "vmovdqu %%ymm2,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// Fills dst with the byte v8 using `rep stosl`, i.e. 32 bits at a time.
// Only width >> 2 dwords are stored, so any trailing (width & 3) bytes are
// not written by this function.
void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile(
      "rep stosl \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}

// Fills `width` bytes of dst with v8 using `rep stosb`.
void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(
      "rep stosb \n"
      : "+D"(dst),       // %0
        "+c"(width_tmp)  // %1
      : "a"(v8)          // %2
      : "memory", "cc");
}

// Stores the 32-bit value v32 `width` times starting at dst_argb, using
// `rep stosl` (width counts 32-bit elements here, not bytes).
void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile(
      "rep stosl \n"
      : "+D"(dst_argb),  // %0
        "+c"(width_tmp)  // %1
      : "a"(v32)         // %2
      : "memory", "cc");
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the even bytes (low byte of every 16-bit pair) of src_yuy2 into
// dst_y. Each iteration reads 32 bytes and writes 16; width drops by 16.
void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff per word

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// Averages (pavgb) the row at src_yuy2 with the row stride_yuy2 bytes after
// it, keeps the odd bytes of the averaged data, then splits those into
// alternating bytes: the first of each pair goes to dst_u, the second to
// dst_v. Each iteration reads 32 bytes per row and writes 8 bytes to each
// destination; width drops by 16.
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff per word
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"  // keep odd bytes
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // even bytes of the packed pairs
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"  // odd bytes of the packed pairs
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

// Same split as YUY2ToUVRow_SSE2 but from a single row: no second row is
// read and no averaging is done.
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// Extracts the odd bytes (high byte of every 16-bit pair) of src_uyvy into
// dst_y. Each iteration reads 32 bytes and writes 16; width drops by 16.
void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1");
}

// Averages (pavgb) the row at src_uyvy with the row stride_uyvy bytes after
// it, keeps the even bytes of the averaged data, then splits those into
// alternating bytes: the first of each pair goes to dst_u, the second to
// dst_v. Each iteration reads 32 bytes per row and writes 8 bytes to each
// destination; width drops by 16.
void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff per word
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "movdqu 0x00(%0,%4,1),%%xmm2 \n"
      "movdqu 0x10(%0,%4,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"
      "pavgb %%xmm3,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"  // keep even bytes
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
}

// Same split as UYVYToUVRow_SSE2 but from a single row: no second row is
// read and no averaging is done.
void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psrlw $0x8,%%xmm5 \n"
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu 0x10(%0),%%xmm1 \n"
      "lea 0x20(%0),%0 \n"
      "pand %%xmm5,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "pand %%xmm5,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm1 \n"
      "movq %%xmm0,(%1) \n"
      "movq %%xmm1,0x00(%1,%2,1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x10,%3 \n"
      "jg 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_YUY2TOYROW_AVX2
// AVX2 form of the even-byte extract: reads 64 bytes and writes 32 per
// iteration; width drops by 32.
// vpackuswb packs within each 128-bit lane, so vpermq $0xd8 (qword order
// 0,2,1,3) is applied afterwards to put the output back in source order.
void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // ymm5 = 0x00ff per word

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// AVX2 form of the two-row YUY2 chroma extract: averages the row at src_yuy2
// with the row stride_yuy2 bytes after it, keeps the odd bytes, and splits
// them alternately into dst_u and dst_v. Each iteration reads 64 bytes per
// row and writes 16 bytes to each destination; width drops by 32.
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // ymm5 = 0x00ff per word
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"  // keep odd bytes
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo per-lane pack ordering
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"  // ymm1 = even bytes of pairs (-> dst_u)
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"   // ymm0 = odd bytes of pairs (-> dst_v)
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"  // store low 128 bits
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_yuy2))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// Single-row variant of YUY2ToUVRow_AVX2: the same odd-byte extract and
// U/V split, with no second row and no averaging.
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// AVX2 form of the odd-byte extract: reads 64 bytes and writes 32 per
// iteration; width drops by 32.
// NOTE(review): xmm5 is listed as clobbered although the asm never touches
// it; harmless, but could be dropped.
void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// AVX2 form of the two-row UYVY chroma extract: averages the row at src_uyvy
// with the row stride_uyvy bytes after it, keeps the even bytes, and splits
// them alternately into dst_u and dst_v. Each iteration reads 64 bytes per
// row and writes 16 bytes to each destination; width drops by 32.
void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // ymm5 = 0x00ff per word
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // keep even bytes
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),               // %0
        "+r"(dst_u),                  // %1
        "+r"(dst_v),                  // %2
        "+r"(width)                   // %3
      : "r"((intptr_t)(stride_uyvy))  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

// Single-row variant of UYVYToUVRow_AVX2: the same even-byte extract and
// U/V split, with no second row and no averaging.
void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
      "sub %1,%2 \n"  // dst_v becomes an offset from dst_u

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpand %%ymm5,%%ymm0,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm1,%%ymm1 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vextractf128 $0x0,%%ymm1,(%1) \n"
      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x20,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
// For each 32-bit pixel, byte 3 is copied into the low byte of both of the
// pixel's 16-bit lanes; the 0x80 selectors zero the high byte of each lane.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, then 1 pixel at a time for the remainder, so any
// non-negative width is handled.
// With a = byte 3 of the src_argb pixel, each output pixel is
//   saturating_add(src_argb | 0xff000000, (src_argb1 * (256 - a)) >> 8)
// computed per byte channel; byte 3 of the result therefore saturates to
// 0xff.
void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                        const uint8_t* src_argb1,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "pcmpeqb %%xmm7,%%xmm7 \n"
      "psrlw $0xf,%%xmm7 \n"  // xmm7 = 0x0001 per word
      "pcmpeqb %%xmm6,%%xmm6 \n"
      "psrlw $0x8,%%xmm6 \n"  // xmm6 = 0x00ff per word
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"  // xmm5 = 0xff00 per word
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "pslld $0x18,%%xmm4 \n"  // xmm4 = 0xff000000 per pixel
      "sub $0x4,%3 \n"
      "jl 49f \n"  // fewer than 4 pixels: skip to the 1 pixel loop

      // 4 pixel loop.
      LABELALIGN
      "40: \n"
      "movdqu (%0),%%xmm3 \n"
      "lea 0x10(%0),%0 \n"
      "movdqa %%xmm3,%%xmm0 \n"
      "pxor %%xmm4,%%xmm3 \n"  // byte 3 becomes 255 - a
      "movdqu (%1),%%xmm2 \n"
      "pshufb %4,%%xmm3 \n"  // broadcast 255 - a to both words of the pixel
      "pand %%xmm6,%%xmm2 \n"  // even bytes of src_argb1 as words
      "paddw %%xmm7,%%xmm3 \n"  // 256 - a
      "pmullw %%xmm3,%%xmm2 \n"
      "movdqu (%1),%%xmm1 \n"
      "lea 0x10(%1),%1 \n"
      "psrlw $0x8,%%xmm1 \n"  // odd bytes of src_argb1 as words
      "por %%xmm4,%%xmm0 \n"  // force byte 3 of src_argb to 0xff
      "pmullw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm2 \n"
      "paddusb %%xmm2,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"  // high byte of product sits in the odd byte
      "paddusb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%2) \n"
      "lea 0x10(%2),%2 \n"
      "sub $0x4,%3 \n"
      "jge 40b \n"

      "49: \n"
      "add $0x3,%3 \n"  // width is now (remaining pixels - 1)
      "jl 99f \n"

      // 1 pixel loop.
      "91: \n"
      "movd (%0),%%xmm3 \n"
      "lea 0x4(%0),%0 \n"
      "movdqa %%xmm3,%%xmm0 \n"
      "pxor %%xmm4,%%xmm3 \n"
      "movd (%1),%%xmm2 \n"
      "pshufb %4,%%xmm3 \n"
      "pand %%xmm6,%%xmm2 \n"
      "paddw %%xmm7,%%xmm3 \n"
      "pmullw %%xmm3,%%xmm2 \n"
      "movd (%1),%%xmm1 \n"
      "lea 0x4(%1),%1 \n"
      "psrlw $0x8,%%xmm1 \n"
      "por %%xmm4,%%xmm0 \n"
      "pmullw %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm2 \n"
      "paddusb %%xmm2,%%xmm0 \n"
      "pand %%xmm5,%%xmm1 \n"
      "paddusb %%xmm1,%%xmm0 \n"
      "movd %%xmm0,(%2) \n"
      "lea 0x4(%2),%2 \n"
      "sub $0x1,%3 \n"
      "jge 91b \n"
      "99: \n"
      : "+r"(src_argb),    // %0
        "+r"(src_argb1),   // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      : "m"(kShuffleAlpha)  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// In the code below A2 is a byte of src0, B2 a byte of src1 and C2 the
// matching byte of alpha. The signed form is the one implemented: pixels are
// biased by 128 (psubb) so pmaddubsw can treat them as signed, and 0x807f
// (32768 + 127) is added before the shift by 8.
void BlendPlaneRow_SSSE3(const uint8_t* src0,
                         const uint8_t* src1,
                         const uint8_t* alpha,
                         uint8_t* dst,
                         int width) {
  asm volatile(
      "pcmpeqb %%xmm5,%%xmm5 \n"
      "psllw $0x8,%%xmm5 \n"  // xmm5 = 0xff00 per word
      "mov $0x80808080,%%eax \n"
      "movd %%eax,%%xmm6 \n"
      "pshufd $0x0,%%xmm6,%%xmm6 \n"  // xmm6 = 0x80 in every byte
      "mov $0x807f807f,%%eax \n"
      "movd %%eax,%%xmm7 \n"
      "pshufd $0x0,%%xmm7,%%xmm7 \n"  // xmm7 = 0x807f in every word
      // Turn src0, src1 and dst into offsets from alpha so that advancing
      // the alpha pointer alone walks all four buffers.
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%2),%%xmm0 \n"
      "punpcklbw %%xmm0,%%xmm0 \n"
      "pxor %%xmm5,%%xmm0 \n"  // each word = (alpha, 255 - alpha)
      "movq (%0,%2,1),%%xmm1 \n"
      "movq (%1,%2,1),%%xmm2 \n"
      "punpcklbw %%xmm2,%%xmm1 \n"  // each word = (src0, src1)
      "psubb %%xmm6,%%xmm1 \n"
      "pmaddubsw %%xmm1,%%xmm0 \n"
      "paddw %%xmm7,%%xmm0 \n"
      "psrlw $0x8,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%3,%2,1) \n"
      "lea 0x8(%2),%2 \n"
      "sub $0x8,%4 \n"
      "jg 1b \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Same arithmetic as the SSSE3 version, on 32 bytes per iteration. The
// unpack and pack steps are both per 128-bit lane, so the byte order of the
// output matches the input without an extra permute.
void BlendPlaneRow_AVX2(const uint8_t* src0,
                        const uint8_t* src1,
                        const uint8_t* alpha,
                        uint8_t* dst,
                        int width) {
  asm volatile(
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpsllw $0x8,%%ymm5,%%ymm5 \n"  // ymm5 = 0xff00 per word
      "mov $0x80808080,%%eax \n"
      "vmovd %%eax,%%xmm6 \n"
      "vbroadcastss %%xmm6,%%ymm6 \n"  // ymm6 = 0x80 in every byte
      "mov $0x807f807f,%%eax \n"
      "vmovd %%eax,%%xmm7 \n"
      "vbroadcastss %%xmm7,%%ymm7 \n"  // ymm7 = 0x807f in every word
      // src0, src1 and dst become offsets from alpha.
      "sub %2,%0 \n"
      "sub %2,%1 \n"
      "sub %2,%3 \n"

      // 32 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%2),%%ymm0 \n"
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpxor %%ymm5,%%ymm3,%%ymm3 \n"  // each word = (alpha, 255 - alpha)
      "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
      "vmovdqu (%0,%2,1),%%ymm1 \n"
      "vmovdqu (%1,%2,1),%%ymm2 \n"
      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"  // each word = (src0, src1)
      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
      "vpsubb %%ymm6,%%ymm4,%%ymm4 \n"
      "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
      "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
      "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,(%3,%2,1) \n"
      "lea 0x20(%2),%2 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
// kShuffleAlpha0 covers pixels 0 and 1, kShuffleAlpha1 pixels 2 and 3. For
// each pixel, byte 3 is copied into six bytes (three 16-bit lanes) and the
// fourth lane is zeroed by the 128 selectors.
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
// Bytes 0..2 of every pixel are scaled by byte 3 of the same pixel: both the
// channel and the alpha are widened to 16 bits by byte duplication, multiplied
// with pmulhuw (high 16 bits of the product) and shifted right by a further
// 8. Byte 3 itself is copied through unchanged from the source.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "pcmpeqb %%xmm3,%%xmm3 \n"
      "pslld $0x18,%%xmm3 \n"  // xmm3 = 0xff000000 per pixel
      "movdqa %3,%%xmm4 \n"
      "movdqa %4,%%xmm5 \n"

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "pshufb %%xmm4,%%xmm0 \n"  // alpha words for pixels 0,1
      "movdqu (%0),%%xmm1 \n"
      "punpcklbw %%xmm1,%%xmm1 \n"  // channel words for pixels 0,1
      "pmulhuw %%xmm1,%%xmm0 \n"
      "movdqu (%0),%%xmm1 \n"
      "pshufb %%xmm5,%%xmm1 \n"  // alpha words for pixels 2,3
      "movdqu (%0),%%xmm2 \n"
      "punpckhbw %%xmm2,%%xmm2 \n"  // channel words for pixels 2,3
      "pmulhuw %%xmm2,%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "lea 0x10(%0),%0 \n"
      "pand %%xmm3,%%xmm2 \n"  // original byte 3 of each pixel
      "psrlw $0x8,%%xmm0 \n"
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "por %%xmm2,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"
      "lea 0x10(%1),%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),       // %0
        "+r"(dst_argb),       // %1
        "+r"(width)           // %2
      : "m"(kShuffleAlpha0),  // %3
        "m"(kShuffleAlpha1)   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Applied after the pixel bytes have been widened to words: the word at
// bytes 6..7 (resp. 14..15) is copied over the three preceding word lanes and
// the fourth lane is zeroed.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Attenuate 8 pixels at a time.
// Same arithmetic as the SSSE3 version on 32 bytes per iteration. dst_argb is
// converted to an offset from src_argb up front so only one pointer is
// advanced in the loop.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vbroadcastf128 %3,%%ymm4 \n"
      "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
      "vpslld $0x18,%%ymm5,%%ymm5 \n"  // ymm5 = 0xff000000 per pixel
      "sub %0,%1 \n"

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm6 \n"
      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
      "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"  // alpha words
      "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
      "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
      "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
      "vpand %%ymm5,%%ymm6,%%ymm6 \n"  // original byte 3 of each pixel
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpor %%ymm6,%%ymm0,%%ymm0 \n"
      "vmovdqu %%ymm0,0x00(%0,%1,1) \n"
      "lea 0x20(%0),%0 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "m"(kShuffleAlpha_AVX2)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; asm volatile( // 4 pixel loop. LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movzb 0x03(%0),%3 \n" "punpcklbw %%xmm0,%%xmm0 \n" "movd 0x00(%4,%3,4),%%xmm2 \n" "movzb 0x07(%0),%3 \n" "movd 0x00(%4,%3,4),%%xmm3 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "movlhps %%xmm3,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm0 \n" "movdqu (%0),%%xmm1 \n" "movzb 0x0b(%0),%3 \n" "punpckhbw %%xmm1,%%xmm1 \n" "movd 0x00(%4,%3,4),%%xmm2 \n" "movzb 0x0f(%0),%3 \n" "movd 0x00(%4,%3,4),%%xmm3 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "movlhps %%xmm3,%%xmm2 \n" "pmulhuw %%xmm2,%%xmm1 \n" "lea 0x10(%0),%0 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 "=&r"(alpha) // %3 : "r"(fixed_invtbl8) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // Unattenuate 8 pixels at a time. void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uintptr_t alpha; asm volatile( "sub %0,%1 \n" "vbroadcastf128 %5,%%ymm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" // replace VPGATHER "movzb 0x03(%0),%3 \n" "vmovd 0x00(%4,%3,4),%%xmm0 \n" "movzb 0x07(%0),%3 \n" "vmovd 0x00(%4,%3,4),%%xmm1 \n" "movzb 0x0b(%0),%3 \n" "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" "vmovd 0x00(%4,%3,4),%%xmm2 \n" "movzb 0x0f(%0),%3 \n" "vmovd 0x00(%4,%3,4),%%xmm3 \n" "movzb 0x13(%0),%3 \n" "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" "vmovd 0x00(%4,%3,4),%%xmm0 \n" "movzb 0x17(%0),%3 \n" "vmovd 0x00(%4,%3,4),%%xmm1 \n" "movzb 0x1b(%0),%3 \n" "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" "vmovd 0x00(%4,%3,4),%%xmm2 \n" "movzb 0x1f(%0),%3 \n" "vmovd 0x00(%4,%3,4),%%xmm3 \n" "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" // end of VPGATHER "vmovdqu (%0),%%ymm6 \n" "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%0,%1,1) \n" "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width), // %2 "=&r"(alpha) // %3 : "r"(fixed_invtbl8), // %4 "m"(kUnattenShuffleAlpha_AVX2) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBUNATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "psubb %%xmm5,%%xmm0 \n" "psubb %%xmm5,%%xmm1 \n" "movdqu %%xmm4,%%xmm6 \n" "pmaddubsw %%xmm0,%%xmm6 \n" "movdqu %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm1,%%xmm0 \n" "phaddw %%xmm0,%%xmm6 \n" "paddw %%xmm5,%%xmm6 \n" "psrlw $0x8,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movdqu (%0),%%xmm2 \n" "movdqu 0x10(%0),%%xmm3 \n" "lea 0x20(%0),%0 \n" "psrld $0x18,%%xmm2 \n" "psrld $0x18,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" "packuswb %%xmm2,%%xmm2 \n" "movdqa %%xmm6,%%xmm3 \n" "punpcklbw %%xmm6,%%xmm6 \n" "punpcklbw %%xmm2,%%xmm3 \n" "movdqa %%xmm6,%%xmm1 \n" "punpcklwd %%xmm3,%%xmm6 \n" "punpckhwd %%xmm3,%%xmm1 \n" "movdqu %%xmm6,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 "m"(kSub128) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBGRAYROW_SSSE3 #ifdef HAS_ARGBSEPIAROW_SSSE3 // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0}; static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0}; static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { asm volatile( "movdqa %2,%%xmm2 \n" "movdqa %3,%%xmm3 \n" "movdqa %4,%%xmm4 \n" // 8 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm6 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm6 \n" "phaddw %%xmm6,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movdqu (%0),%%xmm5 \n" "movdqu 0x10(%0),%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm5 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm5,%%xmm0 \n" "movdqu (%0),%%xmm5 \n" "movdqu 0x10(%0),%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm5 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "phaddw %%xmm1,%%xmm5 \n" "psrlw $0x7,%%xmm5 \n" "packuswb %%xmm5,%%xmm5 \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x10(%0),%%xmm1 \n" "psrld $0x18,%%xmm6 \n" "psrld $0x18,%%xmm1 \n" "packuswb %%xmm1,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm5 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm5,%%xmm0 \n" "punpckhwd %%xmm5,%%xmm1 \n" "movdqu %%xmm0,(%0) \n" "movdqu %%xmm1,0x10(%0) \n" "lea 0x20(%0),%0 \n" "sub $0x8,%1 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "m"(kARGBToSepiaB), // %2 "m"(kARGBToSepiaG), // %3 "m"(kARGBToSepiaR) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBSEPIAROW_SSSE3 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { asm volatile( "movdqu (%3),%%xmm5 \n" "pshufd $0x00,%%xmm5,%%xmm2 \n" "pshufd $0x55,%%xmm5,%%xmm3 \n" "pshufd $0xaa,%%xmm5,%%xmm4 \n" "pshufd $0xff,%%xmm5,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm7 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "pmaddubsw %%xmm2,%%xmm7 \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x10(%0),%%xmm1 \n" "pmaddubsw %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm3,%%xmm1 \n" "phaddsw %%xmm7,%%xmm0 \n" "phaddsw %%xmm1,%%xmm6 \n" "psraw $0x6,%%xmm0 \n" "psraw $0x6,%%xmm6 \n" "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm0 \n" "movdqu (%0),%%xmm1 \n" "movdqu 0x10(%0),%%xmm7 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm7 \n" "phaddsw %%xmm7,%%xmm1 \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x10(%0),%%xmm7 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm7 \n" "phaddsw %%xmm7,%%xmm6 \n" "psraw $0x6,%%xmm1 \n" "psraw $0x6,%%xmm6 \n" "packuswb %%xmm1,%%xmm1 \n" "packuswb %%xmm6,%%xmm6 \n" "punpcklbw %%xmm6,%%xmm1 \n" "movdqa %%xmm0,%%xmm6 \n" "punpcklwd %%xmm1,%%xmm0 \n" "punpckhwd %%xmm1,%%xmm6 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm6,0x10(%1) \n" "lea 0x20(%0),%0 \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(matrix_argb) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile( "movd %2,%%xmm2 \n" "movd %3,%%xmm3 \n" "movd %4,%%xmm4 \n" "pshuflw $0x40,%%xmm2,%%xmm2 \n" "pshufd $0x44,%%xmm2,%%xmm2 \n" "pshuflw $0x40,%%xmm3,%%xmm3 \n" "pshufd $0x44,%%xmm3,%%xmm3 \n" "pshuflw $0x40,%%xmm4,%%xmm4 \n" "pshufd $0x44,%%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "pslld $0x18,%%xmm6 \n" // 4 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "punpcklbw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" "movdqu (%0),%%xmm1 \n" "punpckhbw %%xmm5,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm1 \n" "pmullw %%xmm3,%%xmm0 \n" "movdqu (%0),%%xmm7 \n" "pmullw %%xmm3,%%xmm1 \n" "pand %%xmm6,%%xmm7 \n" "paddw %%xmm4,%%xmm0 \n" "paddw %%xmm4,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "por %%xmm7,%%xmm0 \n" "movdqu %%xmm0,(%0) \n" "lea 0x10(%0),%0 \n" "sub $0x4,%1 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 "r"(interval_size), // %3 "r"(interval_offset) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. void ARGBShadeRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { asm volatile( "movd %3,%%xmm2 \n" "punpcklbw %%xmm2,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm2 \n" // 4 pixel loop. LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" "pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm1 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(value) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( "pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm2 \n" "lea 0x10(%1),%1 \n" "movdqu %%xmm0,%%xmm1 \n" "movdqu %%xmm2,%%xmm3 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "punpckhbw %%xmm5,%%xmm3 \n" "pmulhuw %%xmm2,%%xmm0 \n" "pmulhuw %%xmm3,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm1 \n" "lea 0x20(%0),%0 \n" "vmovdqu (%1),%%ymm3 \n" "lea 0x20(%1),%1 \n" "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%2) \n" "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 4 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm1 \n" "lea 0x10(%1),%1 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 4 pixel loop. LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea 0x20(%0),%0 \n" "vpaddusb (%1),%%ymm0,%%ymm0 \n" "lea 0x20(%1),%1 \n" "vmovdqu %%ymm0,(%2) \n" "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0"); } #endif // HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 4 pixel loop. LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "lea 0x10(%0),%0 \n" "movdqu (%1),%%xmm1 \n" "lea 0x10(%1),%1 \n" "psubusb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. void ARGBSubtractRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 4 pixel loop. 
LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "lea 0x20(%0),%0 \n" "vpsubusb (%1),%%ymm0,%%ymm0 \n" "lea 0x20(%1),%1 \n" "vmovdqu %%ymm0,(%2) \n" "lea 0x20(%2),%2 \n" "sub $0x8,%3 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0"); } #endif // HAS_ARGBSUBTRACTROW_AVX2 #ifdef HAS_SOBELXROW_SSE2 // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 void SobelXRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "sub %0,%3 \n" "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "movq 0x2(%0),%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" "movq 0x00(%0,%1,1),%%xmm1 \n" "movq 0x02(%0,%1,1),%%xmm2 \n" "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" "movq 0x00(%0,%2,1),%%xmm2 \n" "movq 0x02(%0,%2,1),%%xmm3 \n" "punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm3 \n" "psubw %%xmm3,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "psubw %%xmm0,%%xmm1 \n" "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0,0x00(%0,%3,1) \n" "lea 0x8(%0),%0 \n" "sub $0x8,%4 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 "+r"(dst_sobelx), // %3 "+r"(width) // %4 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELXROW_SSE2 #ifdef HAS_SOBELYROW_SSE2 // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 void SobelYRow_SSE2(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { asm volatile( "sub %0,%1 \n" "sub %0,%2 \n" "pxor %%xmm5,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "movq 0x00(%0,%1,1),%%xmm1 \n" "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "psubw %%xmm1,%%xmm0 \n" "movq 0x1(%0),%%xmm1 \n" "movq 0x01(%0,%1,1),%%xmm2 \n" "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm2 \n" "psubw %%xmm2,%%xmm1 \n" "movq 0x2(%0),%%xmm2 \n" "movq 0x02(%0,%1,1),%%xmm3 \n" "punpcklbw %%xmm5,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm3 \n" "psubw %%xmm3,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "paddw %%xmm1,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "psubw %%xmm0,%%xmm1 \n" "pmaxsw %%xmm1,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0,0x00(%0,%2,1) \n" "lea 0x8(%0),%0 \n" "sub $0x8,%3 \n" "jg 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELYROW_SSE2 #ifdef HAS_SOBELROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. // A = 255 // R = Sobel // G = Sobel // B = Sobel void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm2 \n" "punpcklbw %%xmm0,%%xmm2 \n" "punpckhbw %%xmm0,%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm1 \n" "punpckhwd %%xmm2,%%xmm2 \n" "por %%xmm5,%%xmm1 \n" "por %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" "punpcklwd %%xmm0,%%xmm3 \n" "punpckhwd %%xmm0,%%xmm0 \n" "por %%xmm5,%%xmm3 \n" "por %%xmm5,%%xmm0 \n" "movdqu %%xmm1,(%2) \n" "movdqu %%xmm2,0x10(%2) \n" "movdqu %%xmm3,0x20(%2) \n" "movdqu %%xmm0,0x30(%2) \n" "lea 0x40(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" // 8 pixel loop. LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" "paddusb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_SOBELTOPLANEROW_SSE2 #ifdef HAS_SOBELXYROW_SSE2 // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 // R = Sobel X // G = Sobel // B = Sobel Y void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { asm volatile( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 8 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%1,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" "movdqa %%xmm0,%%xmm2 \n" "paddusb %%xmm1,%%xmm2 \n" "movdqa %%xmm0,%%xmm3 \n" "punpcklbw %%xmm5,%%xmm3 \n" "punpckhbw %%xmm5,%%xmm0 \n" "movdqa %%xmm1,%%xmm4 \n" "punpcklbw %%xmm2,%%xmm4 \n" "punpckhbw %%xmm2,%%xmm1 \n" "movdqa %%xmm4,%%xmm6 \n" "punpcklwd %%xmm3,%%xmm6 \n" "punpckhwd %%xmm3,%%xmm4 \n" "movdqa %%xmm1,%%xmm7 \n" "punpcklwd %%xmm0,%%xmm7 \n" "punpckhwd %%xmm0,%%xmm1 \n" "movdqu %%xmm6,(%2) \n" "movdqu %%xmm4,0x10(%2) \n" "movdqu %%xmm7,0x20(%2) \n" "movdqu %%xmm1,0x30(%2) \n" "lea 0x40(%2),%2 \n" "sub $0x10,%3 \n" "jg 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_SOBELXYROW_SSE2 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { asm volatile( "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm1,%%xmm1 \n" "sub $0x4,%3 \n" "jl 49f \n" "test $0xf,%1 \n" "jne 49f \n" // 4 pixel loop. 
LABELALIGN "40: \n" "movdqu (%0),%%xmm2 \n" "lea 0x10(%0),%0 \n" "movdqa %%xmm2,%%xmm4 \n" "punpcklbw %%xmm1,%%xmm2 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklwd %%xmm1,%%xmm2 \n" "punpckhwd %%xmm1,%%xmm3 \n" "punpckhbw %%xmm1,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "punpcklwd %%xmm1,%%xmm4 \n" "punpckhwd %%xmm1,%%xmm5 \n" "paddd %%xmm2,%%xmm0 \n" "movdqu (%2),%%xmm2 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm0 \n" "movdqu 0x10(%2),%%xmm3 \n" "paddd %%xmm0,%%xmm3 \n" "paddd %%xmm4,%%xmm0 \n" "movdqu 0x20(%2),%%xmm4 \n" "paddd %%xmm0,%%xmm4 \n" "paddd %%xmm5,%%xmm0 \n" "movdqu 0x30(%2),%%xmm5 \n" "lea 0x40(%2),%2 \n" "paddd %%xmm0,%%xmm5 \n" "movdqu %%xmm2,(%1) \n" "movdqu %%xmm3,0x10(%1) \n" "movdqu %%xmm4,0x20(%1) \n" "movdqu %%xmm5,0x30(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x4,%3 \n" "jge 40b \n" "49: \n" "add $0x3,%3 \n" "jl 19f \n" // 1 pixel loop. LABELALIGN "10: \n" "movd (%0),%%xmm2 \n" "lea 0x4(%0),%0 \n" "punpcklbw %%xmm1,%%xmm2 \n" "punpcklwd %%xmm1,%%xmm2 \n" "paddd %%xmm2,%%xmm0 \n" "movdqu (%2),%%xmm2 \n" "lea 0x10(%2),%2 \n" "paddd %%xmm0,%%xmm2 \n" "movdqu %%xmm2,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x1,%3 \n" "jge 10b \n" "19: \n" : "+r"(row), // %0 "+r"(cumsum), // %1 "+r"(previous_cumsum), // %2 "+r"(width) // %3 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, const int32_t* botleft, int width, int area, uint8_t* dst, int count) { asm volatile( "movd %5,%%xmm5 \n" "cvtdq2ps %%xmm5,%%xmm5 \n" "rcpss %%xmm5,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" "sub $0x4,%3 \n" "jl 49f \n" "cmpl $0x80,%5 \n" "ja 40f \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrld $0x10,%%xmm6 \n" "cvtdq2ps %%xmm6,%%xmm6 \n" "addps %%xmm6,%%xmm5 \n" "mulps %%xmm4,%%xmm5 \n" "cvtps2dq %%xmm5,%%xmm5 \n" "packssdw %%xmm5,%%xmm5 \n" // 4 pixel small loop. 
LABELALIGN "4: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x30(%0),%%xmm3 \n" "psubd 0x00(%0,%4,4),%%xmm0 \n" "psubd 0x10(%0,%4,4),%%xmm1 \n" "psubd 0x20(%0,%4,4),%%xmm2 \n" "psubd 0x30(%0,%4,4),%%xmm3 \n" "lea 0x40(%0),%0 \n" "psubd (%1),%%xmm0 \n" "psubd 0x10(%1),%%xmm1 \n" "psubd 0x20(%1),%%xmm2 \n" "psubd 0x30(%1),%%xmm3 \n" "paddd 0x00(%1,%4,4),%%xmm0 \n" "paddd 0x10(%1,%4,4),%%xmm1 \n" "paddd 0x20(%1,%4,4),%%xmm2 \n" "paddd 0x30(%1,%4,4),%%xmm3 \n" "lea 0x40(%1),%1 \n" "packssdw %%xmm1,%%xmm0 \n" "packssdw %%xmm3,%%xmm2 \n" "pmulhuw %%xmm5,%%xmm0 \n" "pmulhuw %%xmm5,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jge 4b \n" "jmp 49f \n" // 4 pixel loop LABELALIGN "40: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x20(%0),%%xmm2 \n" "movdqu 0x30(%0),%%xmm3 \n" "psubd 0x00(%0,%4,4),%%xmm0 \n" "psubd 0x10(%0,%4,4),%%xmm1 \n" "psubd 0x20(%0,%4,4),%%xmm2 \n" "psubd 0x30(%0,%4,4),%%xmm3 \n" "lea 0x40(%0),%0 \n" "psubd (%1),%%xmm0 \n" "psubd 0x10(%1),%%xmm1 \n" "psubd 0x20(%1),%%xmm2 \n" "psubd 0x30(%1),%%xmm3 \n" "paddd 0x00(%1,%4,4),%%xmm0 \n" "paddd 0x10(%1,%4,4),%%xmm1 \n" "paddd 0x20(%1,%4,4),%%xmm2 \n" "paddd 0x30(%1,%4,4),%%xmm3 \n" "lea 0x40(%1),%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" "cvtdq2ps %%xmm1,%%xmm1 \n" "mulps %%xmm4,%%xmm0 \n" "mulps %%xmm4,%%xmm1 \n" "cvtdq2ps %%xmm2,%%xmm2 \n" "cvtdq2ps %%xmm3,%%xmm3 \n" "mulps %%xmm4,%%xmm2 \n" "mulps %%xmm4,%%xmm3 \n" "cvtps2dq %%xmm0,%%xmm0 \n" "cvtps2dq %%xmm1,%%xmm1 \n" "cvtps2dq %%xmm2,%%xmm2 \n" "cvtps2dq %%xmm3,%%xmm3 \n" "packssdw %%xmm1,%%xmm0 \n" "packssdw %%xmm3,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jge 40b \n" "49: \n" "add $0x3,%3 \n" "jl 19f \n" // 1 pixel loop LABELALIGN "10: \n" "movdqu (%0),%%xmm0 \n" "psubd 0x00(%0,%4,4),%%xmm0 \n" "lea 0x10(%0),%0 \n" "psubd (%1),%%xmm0 \n" "paddd 0x00(%1,%4,4),%%xmm0 \n" "lea 
0x10(%1),%1 \n" "cvtdq2ps %%xmm0,%%xmm0 \n" "mulps %%xmm4,%%xmm0 \n" "cvtps2dq %%xmm0,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movd %%xmm0,(%2) \n" "lea 0x4(%2),%2 \n" "sub $0x1,%3 \n" "jge 10b \n" "19: \n" : "+r"(topleft), // %0 "+r"(botleft), // %1 "+r"(dst), // %2 "+rm"(count) // %3 : "r"((intptr_t)(width)), // %4 "rm"(area) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* src_dudv, int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; asm volatile( "movq (%3),%%xmm2 \n" "movq 0x08(%3),%%xmm7 \n" "shl $0x10,%1 \n" "add $0x4,%1 \n" "movd %1,%%xmm5 \n" "sub $0x4,%4 \n" "jl 49f \n" "pshufd $0x44,%%xmm7,%%xmm7 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "movdqa %%xmm2,%%xmm0 \n" "addps %%xmm7,%%xmm0 \n" "movlhps %%xmm0,%%xmm2 \n" "movdqa %%xmm7,%%xmm4 \n" "addps %%xmm4,%%xmm4 \n" "movdqa %%xmm2,%%xmm3 \n" "addps %%xmm4,%%xmm3 \n" "addps %%xmm4,%%xmm4 \n" // 4 pixel loop LABELALIGN "40: \n" "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" "movd %%xmm0,%k5 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" "movd 0x00(%0,%1,1),%%xmm1 \n" "movd 0x00(%0,%5,1),%%xmm6 \n" "punpckldq %%xmm6,%%xmm1 \n" "addps %%xmm4,%%xmm2 \n" "movq %%xmm1,(%2) \n" "movd %%xmm0,%k1 \n" "pshufd $0x39,%%xmm0,%%xmm0 \n" "movd %%xmm0,%k5 \n" "movd 0x00(%0,%1,1),%%xmm0 \n" "movd 0x00(%0,%5,1),%%xmm6 \n" "punpckldq %%xmm6,%%xmm0 \n" "addps %%xmm4,%%xmm3 \n" "movq %%xmm0,0x08(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%4 \n" "jge 40b \n" "49: \n" "add $0x3,%4 \n" "jl 19f \n" // 
1 pixel loop LABELALIGN "10: \n" "cvttps2dq %%xmm2,%%xmm0 \n" "packssdw %%xmm0,%%xmm0 \n" "pmaddwd %%xmm5,%%xmm0 \n" "addps %%xmm7,%%xmm2 \n" "movd %%xmm0,%k1 \n" "movd 0x00(%0,%1,1),%%xmm0 \n" "movd %%xmm0,(%2) \n" "lea 0x04(%2),%2 \n" "sub $0x1,%4 \n" "jge 10b \n" "19: \n" : "+r"(src_argb), // %0 "+r"(src_argb_stride_temp), // %1 "+r"(dst_argb), // %2 "+r"(src_dudv), // %3 "+rm"(width), // %4 "=&r"(temp) // %5 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 void InterpolateRow_SSSE3(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { asm volatile( "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" "je 50f \n" "movd %3,%%xmm0 \n" "neg %3 \n" "add $0x100,%3 \n" "movd %3,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "mov $0x80808080,%%eax \n" "movd %%eax,%%xmm4 \n" "pshufd $0x0,%%xmm4,%%xmm4 \n" // General purpose row blend. LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "movdqu 0x00(%1,%4,1),%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm0 \n" "punpckhbw %%xmm2,%%xmm1 \n" "psubb %%xmm4,%%xmm0 \n" "psubb %%xmm4,%%xmm1 \n" "movdqa %%xmm5,%%xmm2 \n" "movdqa %%xmm5,%%xmm3 \n" "pmaddubsw %%xmm0,%%xmm2 \n" "pmaddubsw %%xmm1,%%xmm3 \n" "paddw %%xmm4,%%xmm2 \n" "paddw %%xmm4,%%xmm3 \n" "psrlw $0x8,%%xmm2 \n" "psrlw $0x8,%%xmm3 \n" "packuswb %%xmm3,%%xmm2 \n" "movdqu %%xmm2,0x00(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" "movdqu (%1),%%xmm0 \n" "movdqu 0x00(%1,%4,1),%%xmm1 \n" "pavgb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,0x00(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 50b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" "movdqu (%1),%%xmm0 \n" "movdqu %%xmm0,0x00(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 100b \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+rm"(width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 void InterpolateRow_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { asm volatile( "sub %1,%0 \n" "cmp $0x0,%3 \n" "je 100f \n" "cmp $0x80,%3 \n" "je 50f \n" "vmovd %3,%%xmm0 \n" "neg %3 \n" "add $0x100,%3 \n" "vmovd %3,%%xmm5 \n" "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" "vbroadcastss %%xmm5,%%ymm5 \n" "mov $0x80808080,%%eax \n" "vmovd %%eax,%%xmm4 \n" "vbroadcastss %%xmm4,%%ymm4 \n" // General purpose row blend. LABELALIGN "1: \n" "vmovdqu (%1),%%ymm0 \n" "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%1,%0,1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "jmp 99f \n" // Blend 50 / 50. LABELALIGN "50: \n" "vmovdqu (%1),%%ymm0 \n" "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,0x00(%1,%0,1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 50b \n" "jmp 99f \n" // Blend 100 / 0 - Copy row unchanged. 
LABELALIGN "100: \n" "vmovdqu (%1),%%ymm0 \n" "vmovdqu %%ymm0,0x00(%1,%0,1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 100b \n" "99: \n" "vzeroupper \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(width), // %2 "+r"(source_y_fraction) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { asm volatile( "movdqu (%3),%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(shuffler) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
// AVX2 variant: the 16 byte shuffle mask at shuffler is applied to every
// 16 byte group (4 pixels) of the source. Each iteration converts 64 bytes
// (16 pixels); the loop body always runs at least once.
void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(
      // Replicate the 16 byte mask into both 128 bit lanes of ymm5, because
      // vpshufb shuffles each lane independently.
      "vbroadcastf128 (%3),%%ymm5 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
      "vmovdqu %%ymm0,(%1) \n"
      "vmovdqu %%ymm1,0x20(%1) \n"
      "lea 0x40(%1),%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm5");
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleaves planar Y, U and V into packed bytes in the order
// Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... Each iteration consumes 16 Y, 8 U and 8 V
// bytes and writes 32 bytes to dst_yuy2.
void I422ToYUY2Row_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      // src_v becomes an offset from src_u so one pointer walks both planes.
      "sub %1,%2 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"            // 8 U bytes
      "movq 0x00(%1,%2,1),%%xmm1 \n"   // 8 V bytes
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"     // U0 V0 U1 V1 ...
      "movdqu (%0),%%xmm0 \n"          // 16 Y bytes
      "add $0x10,%0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "punpcklbw %%xmm2,%%xmm0 \n"     // Y0 U0 Y1 V0 ... (first 8 Y)
      "punpckhbw %%xmm2,%%xmm1 \n"     // same for the last 8 Y
      "movdqu %%xmm0,(%3) \n"
      "movdqu %%xmm1,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
// Interleaves planar Y, U and V into packed bytes in the order
// U0 Y0 V0 Y1 U1 Y2 V1 Y3 ... Each iteration consumes 16 Y, 8 U and 8 V
// bytes and writes 32 bytes to dst_uyvy.
void I422ToUYVYRow_SSE2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      // src_v becomes an offset from src_u so one pointer walks both planes.
      "sub %1,%2 \n"

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%1),%%xmm2 \n"            // 8 U bytes
      "movq 0x00(%1,%2,1),%%xmm1 \n"   // 8 V bytes
      "add $0x8,%1 \n"
      "punpcklbw %%xmm1,%%xmm2 \n"     // U0 V0 U1 V1 ...
      "movdqu (%0),%%xmm0 \n"          // 16 Y bytes
      "movdqa %%xmm2,%%xmm1 \n"
      "add $0x10,%0 \n"
      "punpcklbw %%xmm0,%%xmm1 \n"     // U0 Y0 V0 Y1 ... (first 8 Y)
      "punpckhbw %%xmm0,%%xmm2 \n"     // same for the last 8 Y
      "movdqu %%xmm1,(%3) \n"
      "movdqu %%xmm2,0x10(%3) \n"
      "lea 0x20(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_I422TOYUY2ROW_AVX2
// AVX2 version of the Y/U/V -> "Y U Y V" interleave. Each iteration consumes
// 32 Y, 16 U and 16 V bytes and writes 64 bytes to dst_yuy2.
void I422ToYUY2Row_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      // src_v becomes an offset from src_u so one pointer walks both planes.
      "sub %1,%2 \n"

      // 32 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"            // 16 U bytes -> 16 words
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"   // 16 V bytes -> 16 words
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"        // V into the high byte
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"        // words of (U low, V high)
      "vmovdqu (%0),%%ymm0 \n"              // 32 Y bytes
      "add $0x20,%0 \n"
      // The unpacks work per 128 bit lane, so the four 16 byte results are
      // written out below in low/low/high/high order to restore pixel order.
      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOYUY2ROW_AVX2

#ifdef HAS_I422TOUYVYROW_AVX2
// AVX2 version of the Y/U/V -> "U Y V Y" interleave. Each iteration consumes
// 32 Y, 16 U and 16 V bytes and writes 64 bytes to dst_uyvy.
void I422ToUYVYRow_AVX2(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      // src_v becomes an offset from src_u so one pointer walks both planes.
      "sub %1,%2 \n"

      // 32 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbw (%1),%%ymm1 \n"            // 16 U bytes -> 16 words
      "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"   // 16 V bytes -> 16 words
      "add $0x10,%1 \n"
      "vpsllw $0x8,%%ymm2,%%ymm2 \n"        // V into the high byte
      "vpor %%ymm1,%%ymm2,%%ymm2 \n"        // words of (U low, V high)
      "vmovdqu (%0),%%ymm0 \n"              // 32 Y bytes
      "add $0x20,%0 \n"
      // Same per-lane unpack as the YUY2 version, with the chroma bytes
      // placed ahead of the luma bytes.
      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
      "vextractf128 $0x0,%%ymm1,(%3) \n"
      "vextractf128 $0x0,%%ymm2,0x10(%3) \n"
      "vextractf128 $0x1,%%ymm1,0x20(%3) \n"
      "vextractf128 $0x1,%%ymm2,0x30(%3) \n"
      "lea 0x40(%3),%3 \n"
      "sub $0x20,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+rm"(width)     // %4
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
}
#endif  // HAS_I422TOUYVYROW_AVX2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Evaluates a cubic polynomial on every byte of every pixel:
//   result = C0 + C1 * X + C2 * X^2 + C3 * X^3
// poly holds four groups of four floats (C0 at offset 0, C1 at 0x10, C2 at
// 0x20, C3 at 0x30), one float per byte position within a pixel.
// The coefficients are used as memory operands of mulps/addps, which fault
// on addresses that are not 16 byte aligned, so poly must be 16 byte aligned.
void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "pxor %%xmm3,%%xmm3 \n"  // zero, used to widen bytes

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"            // 8 bytes = 2 pixels
      "lea 0x8(%0),%0 \n"
      "punpcklbw %%xmm3,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "punpcklwd %%xmm3,%%xmm0 \n"     // pixel 0 as 4 ints
      "punpckhwd %%xmm3,%%xmm4 \n"     // pixel 1 as 4 ints
      "cvtdq2ps %%xmm0,%%xmm0 \n"
      "cvtdq2ps %%xmm4,%%xmm4 \n"
      "movdqa %%xmm0,%%xmm1 \n"        // keep X for the higher powers
      "movdqa %%xmm4,%%xmm5 \n"
      "mulps 0x10(%3),%%xmm0 \n"       // C1 * X
      "mulps 0x10(%3),%%xmm4 \n"
      "addps (%3),%%xmm0 \n"           // C0 + C1 * X
      "addps (%3),%%xmm4 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "movdqa %%xmm5,%%xmm6 \n"
      "mulps %%xmm1,%%xmm2 \n"         // X * X
      "mulps %%xmm5,%%xmm6 \n"
      "mulps %%xmm2,%%xmm1 \n"         // X * X * X
      "mulps %%xmm6,%%xmm5 \n"
      "mulps 0x20(%3),%%xmm2 \n"       // C2 * X * X
      "mulps 0x20(%3),%%xmm6 \n"
      "mulps 0x30(%3),%%xmm1 \n"       // C3 * X * X * X
      "mulps 0x30(%3),%%xmm5 \n"
      "addps %%xmm2,%%xmm0 \n"
      "addps %%xmm6,%%xmm4 \n"
      "addps %%xmm1,%%xmm0 \n"
      "addps %%xmm5,%%xmm4 \n"
      "cvttps2dq %%xmm0,%%xmm0 \n"     // truncate to ints
      "cvttps2dq %%xmm4,%%xmm4 \n"
      "packuswb %%xmm4,%%xmm0 \n"
      "packuswb %%xmm0,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA version of the cubic polynomial. The four coefficient groups are
// loaded once, each replicated into both 128 bit lanes, and two pixels
// (8 bytes) are processed per iteration.
void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
                            uint8_t* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile(
      "vbroadcastf128 (%3),%%ymm4 \n"      // C0
      "vbroadcastf128 0x10(%3),%%ymm5 \n"  // C1
      "vbroadcastf128 0x20(%3),%%ymm6 \n"  // C2
      "vbroadcastf128 0x30(%3),%%ymm7 \n"  // C3

      // 2 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxbd (%0),%%ymm0 \n"  // 2 ARGB pixels
      "lea 0x8(%0),%0 \n"
      "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
      "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
      "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
      "vcvttps2dq %%ymm0,%%ymm0 \n"          // truncate to ints
      "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"        // gather both lanes' results
      "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
      "vmovq %%xmm0,(%1) \n"
      "lea 0x8(%1),%1 \n"
      "sub $0x2,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(poly)        // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_HALFFLOATROW_SSE2
// 1.9259299444e-34f is 2^-112. Scaling by it before the 13 bit right shift
// below moves the single precision exponent bias (127) down to the half
// precision bias (15), so the shifted bit pattern can be stored as 16 bits.
// Also used by HalfFloatRow_AVX2.
static float kScaleBias = 1.9259299444e-34f;
// Converts 16 bit integers to 16 bit half floats, multiplying by scale.
// Each iteration converts 8 values.
void HalfFloatRow_SSE2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "movd %3,%%xmm4 \n"
      "pshufd $0x0,%%xmm4,%%xmm4 \n"  // scale in all 4 lanes
      "pxor %%xmm5,%%xmm5 \n"
      "sub %0,%1 \n"                  // dst becomes an offset from src

      // 8 pixel loop.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // 8 shorts
      "add $0x10,%0 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/xmm3
      "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
      "punpckhwd %%xmm5,%%xmm3 \n"
      "cvtdq2ps %%xmm3,%%xmm3 \n"
      "mulps %%xmm4,%%xmm2 \n"
      "mulps %%xmm4,%%xmm3 \n"
      "psrld $0xd,%%xmm2 \n"  // drop 13 low mantissa bits
      "psrld $0xd,%%xmm3 \n"
      "packssdw %%xmm3,%%xmm2 \n"
      // %0 was already advanced, hence the -0x10 displacement.
      "movdqu %%xmm2,-0x10(%0,%1,1) \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "m"(scale)   // %3
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_SSE2

#ifdef HAS_HALFFLOATROW_AVX2
// AVX2 version of HalfFloatRow_SSE2; each iteration converts 16 values.
// Relies on kScaleBias, which is defined inside the HAS_HALFFLOATROW_SSE2
// section of this file.
void HalfFloatRow_AVX2(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  scale *= kScaleBias;
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
      "sub %0,%1 \n"  // dst becomes an offset from src

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // 16 shorts
      "add $0x20,%0 \n"
      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // per-lane unpack reorders values
      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vpsrld $0xd,%%ymm3,%%ymm3 \n"
      "vpsrld $0xd,%%ymm2,%%ymm2 \n"
      "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // per-lane pack restores order
      // %0 was already advanced, hence the -0x20 displacement.
      "vmovdqu %%ymm2,-0x20(%0,%1,1) \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"

      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_HALFFLOATROW_AVX2

#ifdef HAS_HALFFLOATROW_F16C
// Converts 16 bit integers to half floats with the hardware vcvtps2ph
// instruction, multiplying by scale first. scale is used as passed in (no
// kScaleBias adjustment). Each iteration converts 16 values.
void HalfFloatRow_F16C(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "vbroadcastss %3, %%ymm4 \n"
      "sub %0,%1 \n"  // dst becomes an offset from src

      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
      "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"  // imm 3 = round toward zero
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
#if defined(__x86_64__)
      : "x"(scale)  // %3
#else
      : "m"(scale)  // %3
#endif
      : "memory", "cc", "xmm2", "xmm3", "xmm4");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_HALFFLOATROW_F16C
// Same conversion as HalfFloatRow_F16C without the multiply; the float
// parameter is unnamed and ignored. Each iteration converts 16 values.
void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
  asm volatile(
      "sub %0,%1 \n"  // dst becomes an offset from src
      // 16 pixel loop.
      LABELALIGN
      "1: \n"
      "vpmovzxwd (%0),%%ymm2 \n"  // 16 shorts -> 16 ints
      "vpmovzxwd 0x10(%0),%%ymm3 \n"
      "vcvtdq2ps %%ymm2,%%ymm2 \n"
      "vcvtdq2ps %%ymm3,%%ymm3 \n"
      "vcvtps2ph $3, %%ymm2, %%xmm2 \n"  // imm 3 = round toward zero
      "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
      "vmovdqu %%xmm2,0x00(%0,%1,1) \n"
      "vmovdqu %%xmm3,0x10(%0,%1,1) \n"
      "add $0x20,%0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm2", "xmm3");
}
#endif  // HAS_HALFFLOATROW_F16C

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
// Works in place on dst_argb: byte k (k = 0..3) of every pixel is replaced
// by table_argb[value * 4 + k]. One pixel per iteration.
void ARGBColorTableRow_X86(uint8_t* dst_argb,
                           const uint8_t* table_argb,
                           int width) {
  // Scratch register; the "d" constraint pins it to (e/r)dx so that %b1
  // names its low byte.
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"  // advance first; bytes are addressed as -4..-1
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "movzb -0x1(%0),%1 \n"
      "movzb 0x03(%3,%1,4),%1 \n"
      "mov %b1,-0x1(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
// Same in-place lookup as ARGBColorTableRow_X86 for bytes 0..2 of every
// 4 byte pixel; byte 3 is neither read nor written.
void RGBColorTableRow_X86(uint8_t* dst_argb,
                          const uint8_t* table_argb,
                          int width) {
  // Scratch register; the "d" constraint pins it to (e/r)dx so that %b1
  // names its low byte.
  uintptr_t pixel_temp;
  asm volatile(
      // 1 pixel loop.
      LABELALIGN
      "1: \n"
      "movzb (%0),%1 \n"
      "lea 0x4(%0),%0 \n"  // advance first; bytes are addressed as -4..-2
      "movzb 0x00(%3,%1,4),%1 \n"
      "mov %b1,-0x4(%0) \n"
      "movzb -0x3(%0),%1 \n"
      "movzb 0x01(%3,%1,4),%1 \n"
      "mov %b1,-0x3(%0) \n"
      "movzb -0x2(%0),%1 \n"
      "movzb 0x02(%3,%1,4),%1 \n"
      "mov %b1,-0x2(%0) \n"
      "dec %2 \n"
      "jg 1b \n"
      : "+r"(dst_argb),     // %0
        "=&d"(pixel_temp),  // %1
        "+r"(width)         // %2
      : "r"(table_argb)     // %3
      : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
// For each pixel a weighted sum of its four bytes is formed with the byte
// coefficients packed in |lumacoeff|; the sum is masked with 0xff00 and used
// as a byte offset into |luma|, selecting a 256-entry table. The first three
// bytes of the pixel are mapped through that table; the fourth byte is copied
// unchanged.
//   src_argb:  input pixels, 4 bytes each.
//   dst_argb:  output pixels, 4 bytes each.
//   width:     pixel count; 4 pixels are handled per pass.
//   luma:      base address of the lookup tables.
//   lumacoeff: four packed byte coefficients, replicated across xmm3.
void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                 uint8_t* dst_argb,
                                 int width,
                                 const uint8_t* luma,
                                 uint32_t lumacoeff) {
  uintptr_t pixel_temp;  // Scratch byte value (d register, low byte %b0).
  uintptr_t table_temp;  // Address of the table chosen for the current pixel.
  asm volatile(
      "movd %6,%%xmm3 \n"
      "pshufd $0x0,%%xmm3,%%xmm3 \n"
      // xmm4 = 0xff00 in every 16-bit lane; xmm5 = 0.
      "pcmpeqb %%xmm4,%%xmm4 \n"
      "psllw $0x8,%%xmm4 \n"
      "pxor %%xmm5,%%xmm5 \n"

      // 4 pixel loop.
      LABELALIGN
      "1: \n"
      // Compute the four table offsets, one 32-bit lane per pixel.
      "movdqu (%2),%%xmm0 \n"
      "pmaddubsw %%xmm3,%%xmm0 \n"
      "phaddw %%xmm0,%%xmm0 \n"
      "pand %%xmm4,%%xmm0 \n"
      "punpcklwd %%xmm5,%%xmm0 \n"

      // Pixel 0.
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"  // rotate the next offset into lane 0
      "movzb (%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,(%3) \n"
      "movzb 0x1(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x1(%3) \n"
      "movzb 0x2(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x2(%3) \n"
      "movzb 0x3(%2),%0 \n"  // fourth byte is copied without a lookup
      "mov %b0,0x3(%3) \n"

      // Pixel 1.
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb 0x4(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x4(%3) \n"
      "movzb 0x5(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x5(%3) \n"
      "movzb 0x6(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x6(%3) \n"
      "movzb 0x7(%2),%0 \n"
      "mov %b0,0x7(%3) \n"

      // Pixel 2.
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"
      "pshufd $0x39,%%xmm0,%%xmm0 \n"
      "movzb 0x8(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x8(%3) \n"
      "movzb 0x9(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0x9(%3) \n"
      "movzb 0xa(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xa(%3) \n"
      "movzb 0xb(%2),%0 \n"
      "mov %b0,0xb(%3) \n"

      // Pixel 3.
      "movd %%xmm0,%k1 \n"  // 32 bit offset
      "add %5,%1 \n"

      "movzb 0xc(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xc(%3) \n"
      "movzb 0xd(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xd(%3) \n"
      "movzb 0xe(%2),%0 \n"
      "movzb 0x00(%1,%0,1),%0 \n"
      "mov %b0,0xe(%3) \n"
      "movzb 0xf(%2),%0 \n"
      "mov %b0,0xf(%3) \n"
      "lea 0x10(%2),%2 \n"
      "lea 0x10(%3),%3 \n"
      "sub $0x4,%4 \n"
      "jg 1b \n"
      : "=&d"(pixel_temp),  // %0
        "=&a"(table_temp),  // %1
        "+r"(src_argb),     // %2
        "+r"(dst_argb),     // %3
        "+rm"(width)        // %4
      : "r"(luma),          // %5
        "rm"(lumacoeff)     // %6
      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

// Three pshufb masks used by the NV21ToYUV24 row functions. Each mask is
// applied to a register holding 8 Y bytes in its low half (indices 0..7) and
// 8 chroma bytes in its high half (indices 8..15) and yields 16 output bytes.
static const uvec8 kYUV24Shuffle[3] = {
    {8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12},
    {9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15},
    {2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7}};

// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory
//   src_y:     luma row; 16 bytes are read per pass.
//   src_vu:    interleaved chroma row; 16 bytes are read per pass.
//   dst_yuv24: output row; 48 bytes are written per pass.
//   width:     pixel count; 16 pixels are handled per pass.
// NOTE(review): the masks are loaded with movdqa, which requires
// kYUV24Shuffle to be 16-byte aligned — presumably guaranteed by the uvec8
// type declared elsewhere; confirm.
void NV21ToYUV24Row_SSSE3(const uint8_t* src_y,
                          const uint8_t* src_vu,
                          uint8_t* dst_yuv24,
                          int width) {
  asm volatile(
      // Turn src_vu into (src_vu - src_y) so %0 indexes both input rows.
      "sub %0,%1 \n"
      "movdqa (%4),%%xmm4 \n"  // 3 shuffler constants
      "movdqa 16(%4),%%xmm5 \n"
      "movdqa 32(%4),%%xmm6 \n"
      "1: \n"
      "movdqu (%0),%%xmm2 \n"  // load 16 Y values
      "movdqu (%0,%1),%%xmm3 \n"  // load 8 VU values
      "lea 16(%0),%0 \n"
      "movdqa %%xmm2,%%xmm0 \n"
      "movdqa %%xmm2,%%xmm1 \n"
      "shufps $0x44,%%xmm3,%%xmm0 \n"  // Y 0..7, UV 0..3
      "shufps $0x99,%%xmm3,%%xmm1 \n"  // Y 4..11, UV 2..5
      "shufps $0xee,%%xmm3,%%xmm2 \n"  // Y 8..15, UV 4..7
      "pshufb %%xmm4, %%xmm0 \n"  // weave into YUV24
      "pshufb %%xmm5, %%xmm1 \n"
      "pshufb %%xmm6, %%xmm2 \n"
      "movdqu %%xmm0,(%2) \n"
      "movdqu %%xmm1,16(%2) \n"
      "movdqu %%xmm2,32(%2) \n"
      "lea 48(%2),%2 \n"
      "sub $16,%3 \n"  // 16 pixels per loop
      "jg 1b \n"
      : "+r"(src_y),            // %0
        "+r"(src_vu),           // %1
        "+r"(dst_yuv24),        // %2
        "+r"(width)             // %3
      : "r"(&kYUV24Shuffle[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

// Convert biplanar NV21 to packed YUV24
// NV21 has VU in memory for chroma.
// YUV24 is VUY in memory void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { asm volatile( "sub %0,%1 \n" "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants "vbroadcastf128 16(%4),%%ymm5 \n" "vbroadcastf128 32(%4),%%ymm6 \n" "1: \n" "vmovdqu (%0),%%ymm2 \n" // load 32 Y values "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values "lea 32(%0),%0 \n" "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3 "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5 "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7 "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n" "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n" "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm3,(%2) \n" "vmovdqu %%ymm0,32(%2) \n" "vmovdqu %%ymm1,64(%2) \n" "lea 96(%2),%2 \n" "sub $32,%3 \n" // 32 pixels per loop "jg 1b \n" "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 "+r"(width) // %3 : "r"(&kYUV24Shuffle[0]) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #ifdef HAS_NV21ToYUV24ROW_AVX512 // The following VMBI VEX256 code tests okay with the intelsde emulator. 
// Three 32-byte index vectors for vpermt2b as used in NV21ToYUV24Row_AVX512.
// Indices 0..31 pick a byte from the Y register and indices 32..63 pick a byte
// from the VU register, so each vector produces 32 bytes of V,U,Y output.
static const lvec8 kYUV24Perm[3] = {
    {32, 33, 0,  32, 33, 1,  34, 35, 2,  34, 35, 3,  36, 37, 4,  36,
     37, 5,  38, 39, 6,  38, 39, 7,  40, 41, 8,  40, 41, 9,  42, 43},
    {10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15,
     48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52},
    {53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59,
     26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31}};

// Convert biplanar NV21 to packed YUV24 using byte permutes across two
// registers.
//   src_y:     luma row; 32 bytes are read per pass.
//   src_vu:    interleaved chroma row; 32 bytes are read per pass.
//   dst_yuv24: output row; 96 bytes are written per pass.
//   width:     pixel count; 32 pixels are handled per pass.
// NOTE(review): the index vectors are loaded with vmovdqa, which requires
// kYUV24Perm to be 32-byte aligned — presumably guaranteed by the lvec8 type
// declared elsewhere; confirm.
void NV21ToYUV24Row_AVX512(const uint8_t* src_y,
                           const uint8_t* src_vu,
                           uint8_t* dst_yuv24,
                           int width) {
  asm volatile(
      // Turn src_vu into (src_vu - src_y) so %0 indexes both input rows.
      "sub %0,%1 \n"
      "vmovdqa (%4),%%ymm4 \n"  // 3 shuffler constants
      "vmovdqa 32(%4),%%ymm5 \n"
      "vmovdqa 64(%4),%%ymm6 \n"
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm2 \n"  // load 32 Y values
      "vmovdqu (%0,%1),%%ymm3 \n"  // load 16 VU values
      "lea 32(%0),%0 \n"
      // vpermt2b overwrites its destination, so work on copies of Y.
      "vmovdqa %%ymm2, %%ymm0 \n"
      "vmovdqa %%ymm2, %%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n"
      "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n"
      "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n"
      "vmovdqu %%ymm0,(%2) \n"
      "vmovdqu %%ymm1,32(%2) \n"
      "vmovdqu %%ymm2,64(%2) \n"
      "lea 96(%2),%2 \n"
      "sub $32,%3 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_y),         // %0
        "+r"(src_vu),        // %1
        "+r"(dst_yuv24),     // %2
        "+r"(width)          // %3
      : "r"(&kYUV24Perm[0])  // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}

#endif  // HAS_NV21ToYUV24ROW_AVX512

#ifdef HAS_SWAPUVROW_SSSE3

// Shuffle table for reversing the bytes.
// Each adjacent pair of bytes is exchanged (1,0, 3,2, ...).
static const uvec8 kShuffleUVToVU = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};

// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( "movdqu %3,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 : "m"(kShuffleUVToVU) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_SSSE3 #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( "vbroadcastf128 %3,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 : "m"(kShuffleUVToVU) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_SWAPUVROW_AVX2 void HalfMergeUVRow_SSSE3(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width) { asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // load 16 U values "movdqu (%1),%%xmm1 \n" // load 16 V values "movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row "movdqu 0(%1,%5,1),%%xmm3 \n" "lea 0x10(%0),%0 \n" "pmaddubsw %%xmm4,%%xmm0 \n" // half size "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" "lea 0x10(%1),%1 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "psrlw $0x1,%%xmm0 \n" "psrlw $0x1,%%xmm1 \n" "pavgw %%xmm5,%%xmm0 \n" "pavgw %%xmm5,%%xmm1 \n" "packuswb %%xmm0,%%xmm0 \n" "packuswb %%xmm1,%%xmm1 \n" "punpcklbw %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" // store 8 UV pixels "lea 
0x10(%2),%2 \n" "sub $0x10,%3 \n" // 16 src pixels per loop "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : "r"((intptr_t)(src_stride_u)), // %4 "r"((intptr_t)(src_stride_v)) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void HalfMergeUVRow_AVX2(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width) { asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" // load 32 U values "vmovdqu (%1),%%ymm1 \n" // load 32 V values "vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row "vmovdqu 0(%1,%5,1),%%ymm3 \n" "lea 0x20(%0),%0 \n" "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "lea 0x20(%1),%1 \n" "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" "vpsrlw $0x1,%%ymm0,%%ymm0 \n" "vpsrlw $0x1,%%ymm1,%%ymm1 \n" "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels "lea 0x20(%2),%2 \n" "sub $0x20,%3 \n" // 32 src pixels per loop "jg 1b \n" "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : "r"((intptr_t)(src_stride_u)), // %4 "r"((intptr_t)(src_stride_v)) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ClampFloatToZero_SSE2(const float* src_x, float* dst_y, int width) { asm volatile( "pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" "movd (%0),%%xmm0 \n" // load float "maxss %%xmm1, %%xmm0 \n" // clamp to zero "add 4, %0 \n" "movd %%xmm0, (%1) \n" // store float "add 4, %1 \n" "sub $0x4,%2 \n" // 1 float per loop "jg 1b \n" : "+r"(src_x), // %0 "+r"(dst_y), // %1 
"+r"(width) // %2 : : "memory", "cc", "xmm0", "xmm1"); } #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/row_mmi.cc000066400000000000000000016462651416500237200210320ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" #include // For memcpy and memset. #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) // clang-format off void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { uint64_t src0, src1, dest; const uint64_t mask = 0xff000000ULL; __asm__ volatile( "1: \n\t" "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" "or %[src0], %[src0], %[mask] \n\t" "or %[src1], %[src1], %[mask] \n\t" "punpcklwd %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" "or %[src0], %[src0], %[mask] \n\t" "or %[src1], %[src1], %[mask] \n\t" "punpcklwd %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 
1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src_ptr] "r"(src_rgb24), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } void RAWToARGBRow_MMI(const uint8_t* src_raw, uint8_t* dst_argb, int width) { uint64_t src0, src1, dest; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0xff000000ULL; const uint64_t mask2 = 0xc6; __asm__ volatile( "1: \n\t" "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" "or %[src0], %[src0], %[mask1] \n\t" "punpcklbh %[src0], %[src0], %[mask0] \n\t" "pshufh %[src0], %[src0], %[mask2] \n\t" "or %[src1], %[src1], %[mask1] \n\t" "punpcklbh %[src1], %[src1], %[mask0] \n\t" "pshufh %[src1], %[src1], %[mask2] \n\t" "packushb %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "gslwlc1 %[src0], 0x09(%[src_ptr]) \n\t" "gslwrc1 %[src0], 0x06(%[src_ptr]) \n\t" "gslwlc1 %[src1], 0x0c(%[src_ptr]) \n\t" "gslwrc1 %[src1], 0x09(%[src_ptr]) \n\t" "or %[src0], %[src0], %[mask1] \n\t" "punpcklbh %[src0], %[src0], %[mask0] \n\t" "pshufh %[src0], %[src0], %[mask2] \n\t" "or %[src1], %[src1], %[mask1] \n\t" "punpcklbh %[src1], %[src1], %[mask0] \n\t" "pshufh %[src1], %[src1], %[mask2] \n\t" "packushb %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src_ptr] "r"(src_raw), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [width] "r"(width) : "memory"); } void RAWToRGB24Row_MMI(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { uint64_t src0, src1; uint64_t ftmp[4]; uint64_t mask0 = 0xc6; uint64_t mask1 = 0x6c; __asm__ 
volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_raw]) \n\t" "gsldlc1 %[src0], 0x07(%[src_raw]) \n\t" "gslwrc1 %[src1], 0x08(%[src_raw]) \n\t" "gslwlc1 %[src1], 0x0b(%[src_raw]) \n\t" "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" "punpcklbh %[src1], %[src1], %[zero] \n\t" "pextrh %[ftmp2], %[ftmp0], %[three] \n\t" "pextrh %[ftmp3], %[ftmp1], %[one] \n\t" "pinsrh_3 %[ftmp0], %[ftmp0], %[ftmp3] \n\t" "pextrh %[ftmp3], %[ftmp1], %[two] \n\t" "pinsrh_1 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "pshufh %[src1], %[src1], %[mask1] \n\t" "pextrh %[ftmp2], %[src1], %[zero] \n\t" "pinsrh_2 %[ftmp1], %[ftmp1], %[ftmp2] \n\t" "pinsrh_0 %[src1], %[src1], %[ftmp3] \n\t" "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "packushb %[src1], %[src1], %[zero] \n\t" "gssdrc1 %[ftmp0], 0x00(%[dst_rgb24]) \n\t" "gssdlc1 %[ftmp0], 0x07(%[dst_rgb24]) \n\t" "gsswrc1 %[src1], 0x08(%[dst_rgb24]) \n\t" "gsswlc1 %[src1], 0x0b(%[dst_rgb24]) \n\t" "daddiu %[src_raw], %[src_raw], 0x0c \n\t" "daddiu %[dst_rgb24], %[dst_rgb24], 0x0c \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]) : [src_raw] "r"(src_raw), [dst_rgb24] "r"(dst_rgb24), [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), [one] "f"(0x01), [two] "f"(0x02), [three] "f"(0x03) : "memory"); } void RGB565ToARGBRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { uint64_t ftmp[5]; uint64_t c0 = 0x001f001f001f001f; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t c2 = 0x0007000700070007; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g], %[src1], %[c2] \n\t" "psllh 
%[g], %[g], %[three] \n\t" "or %[g], %[src0], %[g] \n\t" "psrlh %[r], %[src1], %[three] \n\t" "psllh %[src0], %[b], %[three] \n\t" "psrlh %[src1], %[b], %[two] \n\t" "or %[b], %[src0], %[src1] \n\t" "psllh %[src0], %[g], %[two] \n\t" "psrlh %[src1], %[g], %[four] \n\t" "or %[g], %[src0], %[src1] \n\t" "psllh %[src0], %[r], %[three] \n\t" "psrlh %[src1], %[r], %[two] \n\t" "or %[r], %[src0], %[src1] \n\t" "packushb %[b], %[b], %[r] \n\t" "packushb %[g], %[g], %[c1] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklhw %[r], %[src0], %[src1] \n\t" "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" "punpckhhw %[r], %[src0], %[src1] \n\t" "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" "daddiu %[src_rgb565], %[src_rgb565], 0x08 \n\t" "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]) : [src_rgb565] "r"(src_rgb565), [dst_argb] "r"(dst_argb), [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) : "memory"); } void ARGB1555ToARGBRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { uint64_t ftmp[6]; uint64_t c0 = 0x001f001f001f001f; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t c2 = 0x0003000300030003; uint64_t c3 = 0x007c007c007c007c; uint64_t c4 = 0x0001000100010001; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g], %[src1], %[c2] \n\t" "psllh %[g], %[g], %[three] \n\t" "or %[g], %[src0], %[g] \n\t" "and %[r], %[src1], %[c3] \n\t" "psrlh %[r], %[r], %[two] \n\t" "psrlh %[a], %[src1], 
%[seven] \n\t" "psllh %[src0], %[b], %[three] \n\t" "psrlh %[src1], %[b], %[two] \n\t" "or %[b], %[src0], %[src1] \n\t" "psllh %[src0], %[g], %[three] \n\t" "psrlh %[src1], %[g], %[two] \n\t" "or %[g], %[src0], %[src1] \n\t" "psllh %[src0], %[r], %[three] \n\t" "psrlh %[src1], %[r], %[two] \n\t" "or %[r], %[src0], %[src1] \n\t" "xor %[a], %[a], %[c1] \n\t" "paddb %[a], %[a], %[c4] \n\t" "packushb %[b], %[b], %[r] \n\t" "packushb %[g], %[g], %[a] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklhw %[r], %[src0], %[src1] \n\t" "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" "punpckhhw %[r], %[src0], %[src1] \n\t" "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" "daddiu %[src_argb1555], %[src_argb1555], 0x08 \n\t" "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) : [src_argb1555] "r"(src_argb1555), [dst_argb] "r"(dst_argb), [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [c4] "f"(c4), [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) : "memory"); } void ARGB4444ToARGBRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { uint64_t ftmp[6]; uint64_t c0 = 0x000f000f000f000f; uint64_t c1 = 0x00ff00ff00ff00ff; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[g], %[src0], %[four] \n\t" "and %[r], %[src1], %[c0] \n\t" "psrlh %[a], %[src1], %[four] \n\t" "psllh %[src0], %[b], %[four] \n\t" "or %[b], %[src0], %[b] \n\t" "psllh %[src0], %[g], %[four] \n\t" "or %[g], %[src0], %[g] \n\t" "psllh %[src0], %[r], %[four] \n\t" "or %[r], 
%[src0], %[r] \n\t" "psllh %[src0], %[a], %[four] \n\t" "or %[a], %[src0], %[a] \n\t" "packushb %[b], %[b], %[r] \n\t" "packushb %[g], %[g], %[a] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklhw %[r], %[src0], %[src1] \n\t" "gssdrc1 %[r], 0x00(%[dst_argb]) \n\t" "gssdlc1 %[r], 0x07(%[dst_argb]) \n\t" "punpckhhw %[r], %[src0], %[src1] \n\t" "gssdrc1 %[r], 0x08(%[dst_argb]) \n\t" "gssdlc1 %[r], 0x0f(%[dst_argb]) \n\t" "daddiu %[src_argb4444], %[src_argb4444], 0x08 \n\t" "daddiu %[dst_argb], %[dst_argb], 0x10 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b] "=&f"(ftmp[2]), [g] "=&f"(ftmp[3]), [r] "=&f"(ftmp[4]), [a] "=&f"(ftmp[5]) : [src_argb4444] "r"(src_argb4444), [dst_argb] "r"(dst_argb), [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) : "memory"); } void ARGBToRGB24Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint64_t src; __asm__ volatile( "1: \n\t" "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" "gsswlc1 %[src], 0x03(%[dst_ptr]) \n\t" "gsswrc1 %[src], 0x00(%[dst_ptr]) \n\t" "gslwlc1 %[src], 0x07(%[src_ptr]) \n\t" "gslwrc1 %[src], 0x04(%[src_ptr]) \n\t" "gsswlc1 %[src], 0x06(%[dst_ptr]) \n\t" "gsswrc1 %[src], 0x03(%[dst_ptr]) \n\t" "gslwlc1 %[src], 0x0b(%[src_ptr]) \n\t" "gslwrc1 %[src], 0x08(%[src_ptr]) \n\t" "gsswlc1 %[src], 0x09(%[dst_ptr]) \n\t" "gsswrc1 %[src], 0x06(%[dst_ptr]) \n\t" "gslwlc1 %[src], 0x0f(%[src_ptr]) \n\t" "gslwrc1 %[src], 0x0c(%[src_ptr]) \n\t" "gsswlc1 %[src], 0x0c(%[dst_ptr]) \n\t" "gsswrc1 %[src], 0x09(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_rgb), [width] "r"(width) : "memory"); } void ARGBToRAWRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int 
width) { uint64_t src0, src1; uint64_t ftmp[3]; uint64_t mask0 = 0xc6; uint64_t mask1 = 0x18; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" "punpcklbh %[ftmp0], %[src0], %[zero] \n\t" "pshufh %[ftmp0], %[ftmp0], %[mask0] \n\t" "punpckhbh %[ftmp1], %[src0], %[zero] \n\t" "punpcklbh %[ftmp2], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "pextrh %[src0], %[ftmp1], %[two] \n\t" "pinsrh_3 %[ftmp0], %[ftmp0], %[src0] \n\t" "pshufh %[ftmp1], %[ftmp1], %[one] \n\t" "pextrh %[src0], %[ftmp2], %[two] \n\t" "pinsrh_2 %[ftmp1], %[ftmp1], %[src0] \n\t" "pextrh %[src0], %[ftmp2], %[one] \n\t" "pinsrh_3 %[ftmp1], %[ftmp1], %[src0] \n\t" "pextrh %[src0], %[ftmp2], %[zero] \n\t" "pshufh %[src1], %[src1], %[mask1] \n\t" "pinsrh_0 %[src1], %[src1], %[src0] \n\t" "packushb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" "packushb %[src1], %[src1], %[zero] \n\t" "gssdrc1 %[ftmp0], 0x00(%[dst_rgb]) \n\t" "gssdlc1 %[ftmp0], 0x07(%[dst_rgb]) \n\t" "gsswrc1 %[src1], 0x08(%[dst_rgb]) \n\t" "gsswlc1 %[src1], 0x0b(%[dst_rgb]) \n\t" "daddiu %[src_argb], %[src_argb], 0x10 \n\t" "daddiu %[dst_rgb], %[dst_rgb], 0x0c \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]) : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), [zero] "f"(0x00), [one] "f"(0x01), [two] "f"(0x02) : "memory"); } void ARGBToRGB565Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint64_t src0, src1; uint64_t ftmp[3]; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" "punpcklbh %[b], %[src0], %[src1] \n\t" "punpckhbh %[g], 
%[src0], %[src1] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklbh %[b], %[src0], %[zero] \n\t" "punpckhbh %[g], %[src0], %[zero] \n\t" "punpcklbh %[r], %[src1], %[zero] \n\t" "psrlh %[b], %[b], %[three] \n\t" "psrlh %[g], %[g], %[two] \n\t" "psrlh %[r], %[r], %[three] \n\t" "psllh %[g], %[g], %[five] \n\t" "psllh %[r], %[r], %[eleven] \n\t" "or %[b], %[b], %[g] \n\t" "or %[b], %[b], %[r] \n\t" "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" "daddiu %[src_argb], %[src_argb], 0x10 \n\t" "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) : "memory"); } // dither4 is a row of 4 values from 4x4 dither matrix. // The 4x4 matrix contains values to increase RGB. When converting to // fewer bits (565) this provides an ordered dither. // The order in the 4x4 matrix in first byte is upper left. // The 4 values are passed as an int, then referenced as an array, so // endian will not affect order of the original matrix. But the dither4 // will containing the first pixel in the lower byte for little endian // or the upper byte for big endian. 
void ARGBToRGB565DitherRow_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width) { uint64_t src0, src1; uint64_t ftmp[3]; uint64_t c0 = 0x00ff00ff00ff00ff; __asm__ volatile( "punpcklbh %[dither], %[dither], %[zero] \n\t" "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" "punpcklbh %[b], %[src0], %[src1] \n\t" "punpckhbh %[g], %[src0], %[src1] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklbh %[b], %[src0], %[zero] \n\t" "punpckhbh %[g], %[src0], %[zero] \n\t" "punpcklbh %[r], %[src1], %[zero] \n\t" "paddh %[b], %[b], %[dither] \n\t" "paddh %[g], %[g], %[dither] \n\t" "paddh %[r], %[r], %[dither] \n\t" "pcmpgth %[src0], %[b], %[c0] \n\t" "or %[src0], %[src0], %[b] \n\t" "and %[b], %[src0], %[c0] \n\t" "pcmpgth %[src0], %[g], %[c0] \n\t" "or %[src0], %[src0], %[g] \n\t" "and %[g], %[src0], %[c0] \n\t" "pcmpgth %[src0], %[r], %[c0] \n\t" "or %[src0], %[src0], %[r] \n\t" "and %[r], %[src0], %[c0] \n\t" "psrlh %[b], %[b], %[three] \n\t" "psrlh %[g], %[g], %[two] \n\t" "psrlh %[r], %[r], %[three] \n\t" "psllh %[g], %[g], %[five] \n\t" "psllh %[r], %[r], %[eleven] \n\t" "or %[b], %[b], %[g] \n\t" "or %[b], %[b], %[r] \n\t" "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" "daddiu %[src_argb], %[src_argb], 0x10 \n\t" "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]) : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), [dither] "f"(dither4), [c0] "f"(c0), [zero] "f"(0x00), [two] "f"(0x02), [three] "f"(0x03), [five] "f"(0x05), [eleven] "f"(0x0b) : "memory"); } void ARGBToARGB1555Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint64_t src0, src1; uint64_t ftmp[4]; 
__asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" "punpcklbh %[b], %[src0], %[src1] \n\t" "punpckhbh %[g], %[src0], %[src1] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklbh %[b], %[src0], %[zero] \n\t" "punpckhbh %[g], %[src0], %[zero] \n\t" "punpcklbh %[r], %[src1], %[zero] \n\t" "punpckhbh %[a], %[src1], %[zero] \n\t" "psrlh %[b], %[b], %[three] \n\t" "psrlh %[g], %[g], %[three] \n\t" "psrlh %[r], %[r], %[three] \n\t" "psrlh %[a], %[a], %[seven] \n\t" "psllh %[g], %[g], %[five] \n\t" "psllh %[r], %[r], %[ten] \n\t" "psllh %[a], %[a], %[fifteen] \n\t" "or %[b], %[b], %[g] \n\t" "or %[b], %[b], %[r] \n\t" "or %[b], %[b], %[a] \n\t" "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" "daddiu %[src_argb], %[src_argb], 0x10 \n\t" "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), [zero] "f"(0x00), [three] "f"(0x03), [five] "f"(0x05), [seven] "f"(0x07), [ten] "f"(0x0a), [fifteen] "f"(0x0f) : "memory"); } void ARGBToARGB4444Row_MMI(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { uint64_t src0, src1; uint64_t ftmp[4]; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_argb]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_argb]) \n\t" "punpcklbh %[b], %[src0], %[src1] \n\t" "punpckhbh %[g], %[src0], %[src1] \n\t" "punpcklbh %[src0], %[b], %[g] \n\t" "punpckhbh %[src1], %[b], %[g] \n\t" "punpcklbh %[b], %[src0], %[zero] \n\t" "punpckhbh %[g], %[src0], %[zero] \n\t" "punpcklbh %[r], %[src1], %[zero] \n\t" "punpckhbh %[a], %[src1], 
%[zero] \n\t" "psrlh %[b], %[b], %[four] \n\t" "psrlh %[g], %[g], %[four] \n\t" "psrlh %[r], %[r], %[four] \n\t" "psrlh %[a], %[a], %[four] \n\t" "psllh %[g], %[g], %[four] \n\t" "psllh %[r], %[r], %[eight] \n\t" "psllh %[a], %[a], %[twelve] \n\t" "or %[b], %[b], %[g] \n\t" "or %[b], %[b], %[r] \n\t" "or %[b], %[b], %[a] \n\t" "gssdrc1 %[b], 0x00(%[dst_rgb]) \n\t" "gssdlc1 %[b], 0x07(%[dst_rgb]) \n\t" "daddiu %[src_argb], %[src_argb], 0x10 \n\t" "daddiu %[dst_rgb], %[dst_rgb], 0x08 \n\t" "daddiu %[width], %[width], -0x04 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [b] "=&f"(ftmp[0]), [g] "=&f"(ftmp[1]), [r] "=&f"(ftmp[2]), [a] "=&f"(ftmp[3]) : [src_argb] "r"(src_argb), [dst_rgb] "r"(dst_rgb), [width] "r"(width), [zero] "f"(0x00), [four] "f"(0x04), [eight] "f"(0x08), [twelve] "f"(0x0c) : "memory"); }

// -----------------------------------------------------------------------------
// 32-bit RGB -> Y row kernels.
//
// ARGBToYRow_MMI, BGRAToYRow_MMI, ABGRToYRow_MMI and RGBAToYRow_MMI execute
// the same instruction sequence. They differ only in
//   * which halfword lane is overwritten with `value` (pinsrh_3 or pinsrh_0),
//   * the 4 x 16-bit `mask` constant fed to pmaddhw.
// The shared sequence lives in the two helper macros below, which are
// #undef'd again once the kernels have been defined.

// Handles one 8-byte load from src_argb: the bytes are widened to halfwords
// against %[zero], one lane of each half is replaced by %[value], both halves
// go through pmaddhw with %[mask], and the resulting 32-bit words are summed
// and shifted right by 8 into DEST.
//   PINSRH         - "pinsrh_0" or "pinsrh_3".
//   OFF_HI, OFF_LO - byte offsets for the gsldlc1 / gsldrc1 unaligned load.
//   DEST           - name of the asm operand receiving the two results.
#define MMI_Y_2PX(PINSRH, OFF_HI, OFF_LO, DEST)               \
  "gsldlc1    %[src],       " OFF_HI "(%[src_argb])    \n\t"  \
  "gsldrc1    %[src],       " OFF_LO "(%[src_argb])    \n\t"  \
  "punpcklbh  %[src_lo],    %[src],       %[zero]      \n\t"  \
  PINSRH "    %[src_lo],    %[src_lo],    %[value]     \n\t"  \
  "pmaddhw    %[src_lo],    %[src_lo],    %[mask]      \n\t"  \
  "punpckhbh  %[src_hi],    %[src],       %[zero]      \n\t"  \
  PINSRH "    %[src_hi],    %[src_hi],    %[value]     \n\t"  \
  "pmaddhw    %[src_hi],    %[src_hi],    %[mask]      \n\t"  \
  "punpcklwd  %[src],       %[src_lo],    %[src_hi]    \n\t"  \
  "punpckhwd  %[" DEST "],  %[src_lo],    %[src_hi]    \n\t"  \
  "paddw      %[" DEST "],  %[" DEST "],  %[src]       \n\t"  \
  "psrlw      %[" DEST "],  %[" DEST "],  %[eight]     \n\t"

// Expands to the complete body of a Y row kernel. Expects the enclosing
// function to have the parameters (src_argb, dst_y, width).
//
// Every loop iteration reads 0x20 bytes from src_argb, stores 8 bytes to
// dst_y and subtracts 8 from width.
//
// Differences from the previous hand-unrolled version:
//   * src_argb, dst_y and width are advanced by the asm, so they are declared
//     as read-write ("+r") operands rather than input-only ones. GCC does not
//     allow an asm statement to modify input-only operands.
//   * The loop ends on bgtz instead of bnez, matching the UV kernels below.
//     Results are identical for positive multiples of 8; other widths now
//     terminate instead of running until the counter wraps.
//
// NOTE(review): `mask` presumably carries the luma weights 0x19 / 0x81 / 0x42
// with a weight of 1 on the lane that receives `value` (0x1080) - confirm
// against the C reference implementation.
#define MMI_RGB32_TO_Y_ROW(PINSRH, MASK)                                   \
  uint64_t src, src_hi, src_lo;                                            \
  uint64_t dest0, dest1, dest2, dest3;                                     \
  const uint64_t value = 0x1080;                                           \
  const uint64_t mask = MASK;                                              \
  __asm__ volatile(                                                        \
      "1:                                                   \n\t"          \
      MMI_Y_2PX(PINSRH, "0x07", "0x00", "dest0")                           \
      MMI_Y_2PX(PINSRH, "0x0f", "0x08", "dest1")                           \
      MMI_Y_2PX(PINSRH, "0x17", "0x10", "dest2")                           \
      MMI_Y_2PX(PINSRH, "0x1f", "0x18", "dest3")                           \
      "packsswh   %[src_lo],    %[dest0],     %[dest1]      \n\t"          \
      "packsswh   %[src_hi],    %[dest2],     %[dest3]      \n\t"          \
      "packushb   %[dest0],     %[src_lo],    %[src_hi]     \n\t"          \
      "gssdlc1    %[dest0],     0x07(%[dst_y])              \n\t"          \
      "gssdrc1    %[dest0],     0x00(%[dst_y])              \n\t"          \
      "daddiu     %[src_argb],  %[src_argb],  0x20          \n\t"          \
      "daddiu     %[dst_y],     %[dst_y],     0x08          \n\t"          \
      "daddi      %[width],     %[width],    -0x08          \n\t"          \
      "bgtz       %[width],     1b                          \n\t"          \
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),  \
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),  \
        [dest3] "=&f"(dest3), [src_argb] "+r"(src_argb),                   \
        [dst_y] "+r"(dst_y), [width] "+r"(width)                           \
      : [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),           \
        [zero] "f"(0x00)                                                   \
      : "memory")

// -----------------------------------------------------------------------------
// 32-bit RGB -> U/V row kernels (two source rows in, one U row and one V row
// out).
//
// ARGBToUVRow_MMI, BGRAToUVRow_MMI and ABGRToUVRow_MMI share everything except
//   * how `value` and the channel sums are arranged before pmaddhw
//     (the MMI_UV_WEIGH_* macros),
//   * the operand order of the final psubw,
//   * the mask_u / mask_v constants.

// Loads 8 bytes from each of the two rows (src_rgb and src_rgb1), widens the
// bytes to halfwords and leaves (sum of the four halves + ftmp12) >> 1 in
// %[src0]. ftmp12 is preloaded with 0x0001000100010001.
#define MMI_UV_AVG(OFF_LO, OFF_HI)                             \
  "gsldrc1    %[src0],    " OFF_LO "(%[src_rgb])        \n\t"  \
  "gsldlc1    %[src0],    " OFF_HI "(%[src_rgb])        \n\t"  \
  "gsldrc1    %[src1],    " OFF_LO "(%[src_rgb1])       \n\t"  \
  "gsldlc1    %[src1],    " OFF_HI "(%[src_rgb1])       \n\t"  \
  "punpcklbh  %[src_lo],  %[src0],    %[zero]           \n\t"  \
  "punpckhbh  %[src_hi],  %[src0],    %[zero]           \n\t"  \
  "paddh      %[src0],    %[src_lo],  %[src_hi]         \n\t"  \
  "punpcklbh  %[src_lo],  %[src1],    %[zero]           \n\t"  \
  "paddh      %[src0],    %[src0],    %[src_lo]         \n\t"  \
  "punpckhbh  %[src_hi],  %[src1],    %[zero]           \n\t"  \
  "paddh      %[src0],    %[src0],    %[src_hi]         \n\t"  \
  "paddh      %[src0],    %[src0],    %[ftmp12]         \n\t"  \
  "psrlh      %[src0],    %[src0],    %[one]            \n\t"

// Per-format step: builds the U and V pmaddhw inputs from %[src0], inserting
// `value` into one lane of each, then multiplies by mask_u / mask_v.
// U and V name the asm operands that receive the products.
#define MMI_UV_WEIGH_ARGB(U, V)                          \
  "dsll       %[" U "],  %[src0],   %[sixteen]    \n\t"  \
  "pinsrh_0   %[" U "],  %[" U "],  %[value]      \n\t"  \
  "pinsrh_3   %[" V "],  %[src0],   %[value]      \n\t"  \
  "pmaddhw    %[" U "],  %[" U "],  %[mask_u]     \n\t"  \
  "pmaddhw    %[" V "],  %[" V "],  %[mask_v]     \n\t"

#define MMI_UV_WEIGH_BGRA(U, V)                          \
  "dsrl       %[" U "],  %[src0],   %[sixteen]    \n\t"  \
  "pinsrh_3   %[" U "],  %[" U "],  %[value]      \n\t"  \
  "pinsrh_0   %[" V "],  %[src0],   %[value]      \n\t"  \
  "pmaddhw    %[" U "],  %[" U "],  %[mask_u]     \n\t"  \
  "pmaddhw    %[" V "],  %[" V "],  %[mask_v]     \n\t"

#define MMI_UV_WEIGH_ABGR(U, V)                          \
  "pinsrh_3   %[" U "],  %[src0],   %[value]      \n\t"  \
  "dsll       %[" V "],  %[src0],   %[sixteen]    \n\t"  \
  "pinsrh_0   %[" V "],  %[" V "],  %[value]      \n\t"  \
  "pmaddhw    %[" U "],  %[" U "],  %[mask_u]     \n\t"  \
  "pmaddhw    %[" V "],  %[" V "],  %[mask_v]     \n\t"

// Produces two U and two V results in DU / DV from 16 bytes of each row.
//   WEIGH        - one of the MMI_UV_WEIGH_* macros.
//   SUB_A, SUB_B - operand names; U is computed as SUB_A - SUB_B and V as
//                  SUB_B - SUB_A, each followed by an arithmetic shift by 8.
//   OFFn_LO/HI   - byte offsets of the two 8-byte loads.
#define MMI_UV_PAIR(WEIGH, SUB_A, SUB_B, OFF0_LO, OFF0_HI, OFF1_LO, OFF1_HI, \
                    DU, DV)                                                  \
  MMI_UV_AVG(OFF0_LO, OFF0_HI)                                               \
  WEIGH(DU, DV)                                                              \
  MMI_UV_AVG(OFF1_LO, OFF1_HI)                                               \
  WEIGH("src_lo", "src_hi")                                                  \
  "punpcklwd  %[src0],    %[" DU "],     %[src_lo]       \n\t"               \
  "punpckhwd  %[src1],    %[" DU "],     %[src_lo]       \n\t"               \
  "psubw      %[" DU "],  %[" SUB_A "],  %[" SUB_B "]    \n\t"               \
  "psraw      %[" DU "],  %[" DU "],     %[eight]        \n\t"               \
  "punpcklwd  %[src0],    %[" DV "],     %[src_hi]       \n\t"               \
  "punpckhwd  %[src1],    %[" DV "],     %[src_hi]       \n\t"               \
  "psubw      %[" DV "],  %[" SUB_B "],  %[" SUB_A "]    \n\t"               \
  "psraw      %[" DV "],  %[" DV "],     %[eight]        \n\t"

// Expands to the complete body of a UV row kernel. Expects the enclosing
// function to have the parameters
// (src_rgb, src_stride_rgb, dst_u, dst_v, width).
//
// Every loop iteration reads 0x40 bytes from src_rgb and from
// src_rgb + src_stride_rgb, stores 8 bytes to dst_u and 8 bytes to dst_v and
// subtracts 0x10 from width; the loop runs while width is > 0.
//
// NOTE(review): the asm advances src_rgb, dst_u, dst_v and width although they
// are input-only operands. Turning all four into "+r" operands would raise the
// operand count from 27 to 31, above GCC's limit of 30 per asm statement, so
// the constraints are intentionally left as they were; fixing this needs a
// larger restructuring of the operand list.
#define MMI_RGB32_TO_UV_ROW(WEIGH, SUB_A, SUB_B, MASK_U, MASK_V)               \
  uint64_t src_rgb1;                                                           \
  uint64_t ftmp[13];                                                           \
  uint64_t tmp[1];                                                             \
  const uint64_t value = 0x4040;                                               \
  const uint64_t mask_u = MASK_U;                                              \
  const uint64_t mask_v = MASK_V;                                              \
  __asm__ volatile(                                                            \
      "dli        %[tmp0],      0x0001000100010001              \n\t"          \
      "dmtc1      %[tmp0],      %[ftmp12]                       \n\t"          \
      "1:                                                       \n\t"          \
      "daddu      %[src_rgb1],  %[src_rgb],  %[src_stride_rgb]  \n\t"          \
      MMI_UV_PAIR(WEIGH, SUB_A, SUB_B, "0x00", "0x07", "0x08", "0x0f",         \
                  "dest0_u", "dest0_v")                                        \
      MMI_UV_PAIR(WEIGH, SUB_A, SUB_B, "0x10", "0x17", "0x18", "0x1f",         \
                  "dest1_u", "dest1_v")                                        \
      MMI_UV_PAIR(WEIGH, SUB_A, SUB_B, "0x20", "0x27", "0x28", "0x2f",         \
                  "dest2_u", "dest2_v")                                        \
      MMI_UV_PAIR(WEIGH, SUB_A, SUB_B, "0x30", "0x37", "0x38", "0x3f",         \
                  "dest3_u", "dest3_v")                                        \
      "packsswh   %[src0],      %[dest0_u],  %[dest1_u]         \n\t"          \
      "packsswh   %[src1],      %[dest2_u],  %[dest3_u]         \n\t"          \
      "packushb   %[dest0_u],   %[src0],     %[src1]            \n\t"          \
      "gssdlc1    %[dest0_u],   0x07(%[dst_u])                  \n\t"          \
      "gssdrc1    %[dest0_u],   0x00(%[dst_u])                  \n\t"          \
      "packsswh   %[src0],      %[dest0_v],  %[dest1_v]         \n\t"          \
      "packsswh   %[src1],      %[dest2_v],  %[dest3_v]         \n\t"          \
      "packushb   %[dest0_v],   %[src0],     %[src1]            \n\t"          \
      "gssdlc1    %[dest0_v],   0x07(%[dst_v])                  \n\t"          \
      "gssdrc1    %[dest0_v],   0x00(%[dst_v])                  \n\t"          \
      "daddiu     %[src_rgb],   %[src_rgb],  0x40               \n\t"          \
      "daddiu     %[dst_u],     %[dst_u],    0x08               \n\t"          \
      "daddiu     %[dst_v],     %[dst_v],    0x08               \n\t"          \
      "daddi      %[width],     %[width],   -0x10               \n\t"          \
      "bgtz       %[width],     1b                              \n\t"          \
      : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]),                     \
        [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]),                        \
        [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]),                     \
        [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]),                    \
        [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]),                    \
        [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]),                   \
        [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]),                   \
        [tmp0] "=&r"(tmp[0])                                                   \
      : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb),          \
        [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width),            \
        [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value),        \
        [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01),                  \
        [sixteen] "f"(0x10)                                                    \
      : "memory")

// Converts a row of ARGB pixels to Y. Processes 8 pixels per iteration;
// width is expected to be a positive multiple of 8.
void ARGBToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  MMI_RGB32_TO_Y_ROW("pinsrh_3", 0x0001004200810019);
}

// Converts two adjacent ARGB rows (src_rgb and src_rgb + src_stride_rgb) to
// one row of U and one row of V. Consumes 16 pixels per row and emits 8 U and
// 8 V bytes per iteration.
void ARGBToUVRow_MMI(const uint8_t* src_rgb,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  MMI_RGB32_TO_UV_ROW(MMI_UV_WEIGH_ARGB, "src0", "src1", 0x0013002500380002,
                      0x00020038002f0009);
}

// Converts a row of BGRA pixels to Y. Same kernel as ARGBToYRow_MMI with
// `value` inserted into lane 0 and the mask lanes reversed.
void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  MMI_RGB32_TO_Y_ROW("pinsrh_0", 0x0019008100420001);
}

// Converts two adjacent BGRA rows to one row of U and one row of V.
void BGRAToUVRow_MMI(const uint8_t* src_rgb,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  MMI_RGB32_TO_UV_ROW(MMI_UV_WEIGH_BGRA, "src1", "src0", 0x0002003800250013,
                      0x0009002f00380002);
}

// Converts a row of ABGR pixels to Y.
void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  MMI_RGB32_TO_Y_ROW("pinsrh_3", 0x0001001900810042);
}

// Converts two adjacent ABGR rows to one row of U and one row of V.
void ABGRToUVRow_MMI(const uint8_t* src_rgb,
                     int src_stride_rgb,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  MMI_RGB32_TO_UV_ROW(MMI_UV_WEIGH_ABGR, "src1", "src0", 0x0002003800250013,
                      0x0009002F00380002);
}

// Converts a row of RGBA pixels to Y.
void RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  MMI_RGB32_TO_Y_ROW("pinsrh_0", 0x0042008100190001);
}

#undef MMI_RGB32_TO_UV_ROW
#undef MMI_UV_PAIR
#undef MMI_UV_WEIGH_ABGR
#undef MMI_UV_WEIGH_BGRA
#undef MMI_UV_WEIGH_ARGB
#undef MMI_UV_AVG
#undef MMI_RGB32_TO_Y_ROW
#undef MMI_Y_2PX

void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; uint64_t ftmp[13]; uint64_t tmp[1]; const uint64_t value = 0x4040; const uint64_t mask_u = 0x0013002500380002; const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" "gsldlc1 %[src0], 
0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest0_u], %[src0], %[value] \n\t" "dsrl %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" 
"gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest1_u], %[src0], %[value] \n\t" "dsrl %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], 
%[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest2_u], %[src0], %[value] \n\t" "dsrl %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] 
\n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[dest3_u], %[src0], %[value] \n\t" "dsrl %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_3 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_0 %[src_lo], %[src0], %[value] \n\t" "dsrl %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" "psubw %[dest3_v], %[src1], %[src0] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] 
\n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t" : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); }

// Converts a row of 3-byte RGB24 pixels to 8-bit luma.
//
// Each loop iteration consumes 24 bytes (8 pixels) from src_argb and stores
// 8 bytes to dst_y. Pixels are fetched two at a time with 8-byte unaligned
// loads at offsets 0x00, 0x06, 0x0c and 0x12. The second pixel of each pair is
// brought into the upper four bytes by shifting the register left 8 bits.
// After widening, halfword 3 (which holds a byte of the following pixel) is
// replaced by |value| (0x1080), and the multiply-add against |mask| yields
//   y = (25 * byte0 + 129 * byte1 + 66 * byte2 + 0x1080) >> 8
// where byteN is the N-th byte of the pixel in memory.
//
// src_argb: source pixels (3 bytes each, despite the parameter name).
// dst_y:    destination luma bytes.
// width:    number of pixels. The loop body always runs at least once and
//           handles 8 pixels per pass, so callers are presumably expected to
//           pass a positive multiple of 8 - TODO confirm against callers.
//
// NOTE(review): the last load of every iteration covers offsets 0x12..0x19,
// i.e. two bytes beyond the 24 bytes consumed; confirm the source buffer
// always has that slack on the final iteration.
void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest0, dest1, dest2, dest3;
  const uint64_t value = 0x1080;
  const uint64_t mask = 0x0001004200810019;

  __asm__ volatile(
      "1: \n\t"
      // Pixels 0-1 -> dest0 (two 32-bit luma values).
      "gsldlc1 %[src], 0x07(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x00(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest0], %[dest0], %[src] \n\t"
      "psrlw %[dest0], %[dest0], %[eight] \n\t"

      // Pixels 2-3 -> dest1.
      "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x06(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest1], %[dest1], %[src] \n\t"
      "psrlw %[dest1], %[dest1], %[eight] \n\t"

      // Pixels 4-5 -> dest2.
      "gsldlc1 %[src], 0x13(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest2], %[dest2], %[src] \n\t"
      "psrlw %[dest2], %[dest2], %[eight] \n\t"

      // Pixels 6-7 -> dest3.
      "gsldlc1 %[src], 0x19(%[src_argb]) \n\t"
      "gsldrc1 %[src], 0x12(%[src_argb]) \n\t"
      "punpcklbh %[src_lo], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t"
      "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t"
      "dsll %[src], %[src], %[eight] \n\t"
      "punpckhbh %[src_hi], %[src], %[zero] \n\t"
      "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t"
      "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t"
      "punpcklwd %[src], %[src_lo], %[src_hi] \n\t"
      "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t"
      "paddw %[dest3], %[dest3], %[src] \n\t"
      "psrlw %[dest3], %[dest3], %[eight] \n\t"

      // Narrow the eight 32-bit results to bytes and store them.
      "packsswh %[src_lo], %[dest0], %[dest1] \n\t"
      "packsswh %[src_hi], %[dest2], %[dest3] \n\t"
      "packushb %[dest0], %[src_lo], %[src_hi] \n\t"
      "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t"
      "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t"

      "daddiu %[src_argb], %[src_argb], 0x18 \n\t"
      "daddiu %[dst_y], %[dst_y], 0x08 \n\t"
      "daddi %[width], %[width], -0x08 \n\t"
      // bgtz (as in the UV row functions) rather than bnez, so a width that
      // is not a multiple of 8 still terminates the loop.
      "bgtz %[width], 1b \n\t"
      // src_argb, dst_y and width are written by the loop above, so they are
      // read-write ("+r") operands; an asm must not modify input-only
      // operands.
      : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2),
        [dest3] "=&f"(dest3), [src_argb] "+r"(src_argb), [dst_y] "+r"(dst_y),
        [width] "+r"(width)
      : [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08),
        [zero] "f"(0x00)
      : "memory");
}

// RGB24ToUVRow_MMI: each loop iteration consumes 48 bytes (16 RGB24 pixels)
// from the row at src_rgb and from the row src_stride_rgb bytes after it,
// combines every 2x2 pixel block, and stores 8 bytes to dst_u and 8 bytes to
// dst_v.
void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; uint64_t ftmp[13]; uint64_t tmp[1]; const uint64_t value = 0x4040; const uint64_t mask_u = 0x0013002500380002; const uint64_t mask_v = 0x00020038002f0009; __asm__ volatile( "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh
%[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" 
"dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] 
\n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], 
%[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" "psubw %[dest3_v], %[src1], %[src0] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t" : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] 
"=&r"(tmp[0]) : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); } void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) { uint64_t src, src_hi, src_lo; uint64_t dest0, dest1, dest2, dest3; const uint64_t value = 0x1080; const uint64_t mask = 0x0001001900810042; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_argb]) \n\t" "gsldrc1 %[src], 0x00(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "dsll %[src], %[src], %[eight] \n\t" "punpckhbh %[src_hi], %[src], %[zero] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" "punpckhwd %[dest0], %[src_lo], %[src_hi] \n\t" "paddw %[dest0], %[dest0], %[src] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" "gsldlc1 %[src], 0x0d(%[src_argb]) \n\t" "gsldrc1 %[src], 0x06(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "dsll %[src], %[src], %[eight] \n\t" "punpckhbh %[src_hi], %[src], %[zero] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" "punpckhwd %[dest1], %[src_lo], %[src_hi] \n\t" "paddw %[dest1], %[dest1], %[src] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" "gsldlc1 %[src], 0x13(%[src_argb]) \n\t" "gsldrc1 %[src], 0x0c(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "dsll %[src], %[src], %[eight] \n\t" "punpckhbh %[src_hi], %[src], %[zero] \n\t" "pinsrh_3 %[src_hi], %[src_hi], 
%[value] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" "punpckhwd %[dest2], %[src_lo], %[src_hi] \n\t" "paddw %[dest2], %[dest2], %[src] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" "gsldlc1 %[src], 0x19(%[src_argb]) \n\t" "gsldrc1 %[src], 0x12(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src], %[zero] \n\t" "pinsrh_3 %[src_lo], %[src_lo], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "dsll %[src], %[src], %[eight] \n\t" "punpckhbh %[src_hi], %[src], %[zero] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src], %[src_lo], %[src_hi] \n\t" "punpckhwd %[dest3], %[src_lo], %[src_hi] \n\t" "paddw %[dest3], %[dest3], %[src] \n\t" "psrlw %[dest3], %[dest3], %[eight] \n\t" "packsswh %[src_lo], %[dest0], %[dest1] \n\t" "packsswh %[src_hi], %[dest2], %[dest3] \n\t" "packushb %[dest0], %[src_lo], %[src_hi] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" "daddiu %[src_argb], %[src_argb], 0x18 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3) : [src_argb] "r"(src_argb), [dst_y] "r"(dst_y), [width] "r"(width), [mask] "f"(mask), [value] "f"(value), [eight] "f"(0x08), [zero] "f"(0x00) : "memory"); } void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; uint64_t ftmp[13]; uint64_t tmp[1]; const uint64_t value = 0x4040; const uint64_t mask_u = 0x0002003800250013; const uint64_t mask_v = 0x0009002f00380002; __asm__ volatile( "dli %[tmp0], 0x0001000100010001 \n\t" "dmtc1 %[tmp0], %[ftmp12] \n\t" "1: \n\t" "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_rgb]) 
\n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest0_u], %[src0], %[value] \n\t" "dsll %[dest0_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_v], %[dest0_v], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x06(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x0d(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x06(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x0d(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" "psubw %[dest0_u], %[src1], %[src0] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" "psubw %[dest0_v], %[src0], %[src1] \n\t" "psraw %[dest0_v], %[dest0_v], 
%[eight] \n\t" "gsldrc1 %[src0], 0x0c(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x13(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x0c(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x13(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest1_u], %[src0], %[value] \n\t" "dsll %[dest1_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_v], %[dest1_v], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x12(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x19(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x12(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x19(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" "psubw %[dest1_u], %[src1], %[src0] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" "punpckhwd %[src1], 
%[dest1_v], %[src_hi] \n\t" "psubw %[dest1_v], %[src0], %[src1] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest2_u], %[src0], %[value] \n\t" "dsll %[dest2_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_v], %[dest2_v], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x1e(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x25(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x1e(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x25(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" "psubw %[dest2_u], %[src1], %[src0] \n\t" "psraw %[dest2_u], 
%[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" "psubw %[dest2_v], %[src0], %[src1] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x24(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x2b(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x24(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2b(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[dest3_u], %[src0], %[value] \n\t" "dsll %[dest3_v], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_v], %[dest3_v], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x2a(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x31(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x2a(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x31(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "dsll %[src0], %[src0], %[eight] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src_hi] \n\t" "punpcklbh %[src_lo], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_lo] \n\t" "dsll %[src1], %[src1], %[eight] \n\t" "punpckhbh %[src_hi], %[src1], %[zero] \n\t" "paddh %[src0], %[src0], %[src_hi] \n\t" "paddh %[src0], %[src0], %[ftmp12] \n\t" "psrlh %[src0], %[src0], %[one] \n\t" "pinsrh_3 %[src_lo], %[src0], %[value] \n\t" "dsll %[src_hi], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" "punpckhwd 
%[src1], %[dest3_u], %[src_lo] \n\t" "psubw %[dest3_u], %[src1], %[src0] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" "psubw %[dest3_v], %[src0], %[src1] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_rgb], %[src_rgb], 0x30 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t" : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [one] "f"(0x01), [sixteen] "f"(0x10) : "memory"); }

// Converts a row of 4-byte pixels to 8-bit luma, 8 pixels per loop iteration.
//
// For each pixel the asm computes
//   Y = (29 * byte0 + 150 * byte1 + 77 * byte2 + 128) >> 8
// where byte0..byte2 are the first three bytes of the pixel (B, G, R in
// libyuv's ARGB memory order); the fourth byte is overwritten with the
// rounding term before the multiply and so never contributes.
//
//   src_argb  source row, read 32 bytes per iteration (unaligned is fine).
//   dst_y     destination row, written 8 bytes per iteration.
//   width     pixel count. The loop body always runs at least once and
//             consumes whole groups of 8 pixels, so callers are expected to
//             pass a positive multiple of 8.
void ARGBToYJRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  uint64_t src, src_hi, src_lo;
  uint64_t dest, dest0, dest1, dest2, dest3;
  uint64_t tmp0, tmp1;
  const uint64_t shift = 0x08;
  // Rounding term (0.5 in 8.8 fixed point), inserted in place of alpha.
  const uint64_t value = 0x80;
  const uint64_t mask0 = 0x0;
  // Halfword coefficients, low to high: 29 (B), 150 (G), 77 (R), 1 (round).
  const uint64_t mask1 = 0x0001004D0096001DULL;

  __asm__ volatile(
      "1:                                                           \n\t"
      // Pixels 0-1 -> dest0.
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest0],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest0],        %[dest0],          %[shift]      \n\t"

      // Pixels 2-3 -> dest1.
      "gsldlc1    %[src],          0x0f(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x08(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest1],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest1],        %[dest1],          %[shift]      \n\t"

      // Pixels 4-5 -> dest2.
      "gsldlc1    %[src],          0x17(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x10(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest2],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest2],        %[dest2],          %[shift]      \n\t"

      // Pixels 6-7 -> dest3.
      "gsldlc1    %[src],          0x1f(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x18(%[src_ptr])                 \n\t"
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[value]      \n\t"
      "pmaddhw    %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[value]      \n\t"
      "pmaddhw    %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "punpcklwd  %[tmp0],         %[src_lo],         %[src_hi]     \n\t"
      "punpckhwd  %[tmp1],         %[src_lo],         %[src_hi]     \n\t"
      "paddw      %[dest3],        %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest3],        %[dest3],          %[shift]      \n\t"

      // Narrow the eight 32-bit results to bytes and store them.
      "packsswh   %[tmp0],         %[dest0],          %[dest1]      \n\t"
      "packsswh   %[tmp1],         %[dest2],          %[dest3]      \n\t"
      "packushb   %[dest],         %[tmp0],           %[tmp1]       \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x20          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      "daddi      %[width],        %[width],         -0x08          \n\t"
      // bgtz (not bnez): stop as soon as the count is exhausted, even when
      // width is not an exact multiple of 8, matching the sibling row
      // functions in this file.
      "bgtz       %[width],        1b                               \n\t"
      : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi),
        [src_lo] "=&f"(src_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1),
        [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [tmp0] "=&f"(tmp0),
        [tmp1] "=&f"(tmp1),
        // The asm advances both pointers and the counter, so they must be
        // read-write operands rather than input-only ones.
        [src_ptr] "+r"(src_argb), [dst_ptr] "+r"(dst_y), [width] "+r"(width)
      : [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift),
        [value] "f"(value)
      : "memory");
}

void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src_rgb1; uint64_t ftmp[12]; const uint64_t value = 0x4040; const uint64_t mask_u = 0x0015002a003f0002; const uint64_t mask_v = 0x0002003f0035000a; __asm__ volatile( "1: \n\t" "daddu %[src_rgb1], %[src_rgb], %[src_stride_rgb] \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x00(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x07(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest0_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src0], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x08(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x08(%[src_rgb1]) \n\t" "gsldlc1 
%[src1], 0x0f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" "gsldrc1 %[src0], 0x10(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x17(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x10(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x17(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest1_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src0], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x18(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x18(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], 
%[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x20(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x27(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x20(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x27(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest2_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src0], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x28(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x2f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x28(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x2f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw 
%[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x30(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x37(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x30(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x37(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[dest3_u], %[src0], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src0], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" "gsldrc1 %[src0], 0x38(%[src_rgb]) \n\t" "gsldlc1 %[src0], 0x3f(%[src_rgb]) \n\t" "gsldrc1 %[src1], 0x38(%[src_rgb1]) \n\t" "gsldlc1 %[src1], 0x3f(%[src_rgb1]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "punpcklbh %[src0], %[src1], %[zero] \n\t" "punpckhbh %[src1], %[src1], %[zero] \n\t" "paddh %[src0], %[src_lo], %[src0] \n\t" "paddh %[src1], %[src_hi], %[src1] \n\t" "pavgh %[src0], %[src0], %[src1] \n\t" "dsll %[src_lo], %[src0], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src0], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw 
%[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" "psubw %[dest3_v], %[src1], %[src0] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_rgb], %[src_rgb], 0x40 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t" : [src_rgb1] "=&r"(src_rgb1), [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) : [src_rgb] "r"(src_rgb), [src_stride_rgb] "r"(src_stride_rgb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [eight] "f"(0x08), [sixteen] "f"(0x10) : "memory"); } void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { uint64_t ftmp[11]; const uint64_t value = 0x1080108010801080; const uint64_t mask = 0x0001004200810019; uint64_t c0 = 0x001f001f001f001f; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t c2 = 0x0007000700070007; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" 
"and %[g], %[src1], %[c2] \n\t" "psllh %[g], %[g], %[three] \n\t" "or %[g], %[src0], %[g] \n\t" "psrlh %[r], %[src1], %[three] \n\t" "psllh %[src0], %[b], %[three] \n\t" "psrlh %[src1], %[b], %[two] \n\t" "or %[b], %[src0], %[src1] \n\t" "psllh %[src0], %[g], %[two] \n\t" "psrlh %[src1], %[g], %[four] \n\t" "or %[g], %[src0], %[src1] \n\t" "psllh %[src0], %[r], %[three] \n\t" "psrlh %[src1], %[r], %[two] \n\t" "or %[r], %[src0], %[src1] \n\t" "punpcklhw %[src0], %[b], %[r] \n\t" "punpcklhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest0], %[src0], %[src1] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" "punpckhhw %[src0], %[b], %[r] \n\t" "punpckhhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest1], %[src0], %[src1] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g], %[src1], %[c2] \n\t" "psllh %[g], %[g], %[three] \n\t" "or %[g], %[src0], %[g] \n\t" "psrlh %[r], %[src1], %[three] \n\t" "psllh %[src0], %[b], %[three] \n\t" "psrlh %[src1], %[b], %[two] \n\t" "or %[b], %[src0], %[src1] \n\t" "psllh %[src0], %[g], %[two] \n\t" "psrlh %[src1], %[g], %[four] \n\t" "or %[g], %[src0], %[src1] \n\t" "psllh %[src0], %[r], %[three] \n\t" "psrlh %[src1], %[r], %[two] \n\t" "or %[r], %[src0], %[src1] \n\t" 
"punpcklhw %[src0], %[b], %[r] \n\t" "punpcklhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest2], %[src0], %[src1] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" "punpckhhw %[src0], %[b], %[r] \n\t" "punpckhhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest3], %[src0], %[src1] \n\t" "psrlw %[dest3], %[dest3], %[eight] \n\t" "packsswh %[src_lo], %[dest0], %[dest1] \n\t" "packsswh %[src_hi], %[dest2], %[dest3] \n\t" "packushb %[dest0], %[src_lo], %[src_hi] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" "daddiu %[src_rgb565], %[src_rgb565], 0x10 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddiu %[width], %[width], -0x08 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) : [src_rgb565] "r"(src_rgb565), [dst_y] "r"(dst_y), [value] "f"(value), [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [mask] "f"(mask), [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [four] "f"(0x04) : "memory"); } void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { uint64_t ftmp[11]; const uint64_t value = 0x1080108010801080; const uint64_t mask = 0x0001004200810019; uint64_t c0 = 0x001f001f001f001f; uint64_t c1 = 
0x00ff00ff00ff00ff; uint64_t c2 = 0x0003000300030003; uint64_t c3 = 0x007c007c007c007c; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g], %[src1], %[c2] \n\t" "psllh %[g], %[g], %[three] \n\t" "or %[g], %[src0], %[g] \n\t" "and %[r], %[src1], %[c3] \n\t" "psrlh %[r], %[r], %[two] \n\t" "psllh %[src0], %[b], %[three] \n\t" "psrlh %[src1], %[b], %[two] \n\t" "or %[b], %[src0], %[src1] \n\t" "psllh %[src0], %[g], %[three] \n\t" "psrlh %[src1], %[g], %[two] \n\t" "or %[g], %[src0], %[src1] \n\t" "psllh %[src0], %[r], %[three] \n\t" "psrlh %[src1], %[r], %[two] \n\t" "or %[r], %[src0], %[src1] \n\t" "punpcklhw %[src0], %[b], %[r] \n\t" "punpcklhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest0], %[src0], %[src1] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" "punpckhhw %[src0], %[b], %[r] \n\t" "punpckhhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest1], %[src0], %[src1] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" "gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g], %[src1], %[c2] \n\t" "psllh %[g], %[g], %[three] 
\n\t" "or %[g], %[src0], %[g] \n\t" "and %[r], %[src1], %[c3] \n\t" "psrlh %[r], %[r], %[two] \n\t" "psllh %[src0], %[b], %[three] \n\t" "psrlh %[src1], %[b], %[two] \n\t" "or %[b], %[src0], %[src1] \n\t" "psllh %[src0], %[g], %[three] \n\t" "psrlh %[src1], %[g], %[two] \n\t" "or %[g], %[src0], %[src1] \n\t" "psllh %[src0], %[r], %[three] \n\t" "psrlh %[src1], %[r], %[two] \n\t" "or %[r], %[src0], %[src1] \n\t" "punpcklhw %[src0], %[b], %[r] \n\t" "punpcklhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest2], %[src0], %[src1] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" "punpckhhw %[src0], %[b], %[r] \n\t" "punpckhhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest3], %[src0], %[src1] \n\t" "psrlw %[dest3], %[dest3], %[eight] \n\t" "packsswh %[src_lo], %[dest0], %[dest1] \n\t" "packsswh %[src_hi], %[dest2], %[dest3] \n\t" "packushb %[dest0], %[src_lo], %[src_hi] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" "daddiu %[src_argb1555], %[src_argb1555], 0x10 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddiu %[width], %[width], -0x08 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) : [src_argb1555] "r"(src_argb1555), [dst_y] "r"(dst_y), [width] "r"(width), [value] 
"f"(value), [mask] "f"(mask), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [eight] "f"(0x08), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [seven] "f"(0x07) : "memory"); } void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { uint64_t ftmp[11]; uint64_t value = 0x1080108010801080; uint64_t mask = 0x0001004200810019; uint64_t c0 = 0x000f000f000f000f; uint64_t c1 = 0x00ff00ff00ff00ff; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[g], %[src0], %[four] \n\t" "and %[r], %[src1], %[c0] \n\t" "psllh %[src0], %[b], %[four] \n\t" "or %[b], %[src0], %[b] \n\t" "psllh %[src0], %[g], %[four] \n\t" "or %[g], %[src0], %[g] \n\t" "psllh %[src0], %[r], %[four] \n\t" "or %[r], %[src0], %[r] \n\t" "punpcklhw %[src0], %[b], %[r] \n\t" "punpcklhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest0], %[src0], %[src1] \n\t" "psrlw %[dest0], %[dest0], %[eight] \n\t" "punpckhhw %[src0], %[b], %[r] \n\t" "punpckhhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest1], %[src0], %[src1] \n\t" "psrlw %[dest1], %[dest1], %[eight] \n\t" "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" "psrlh %[src1], %[src0], %[eight] \n\t" "and %[b], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" 
"psrlh %[g], %[src0], %[four] \n\t" "and %[r], %[src1], %[c0] \n\t" "psllh %[src0], %[b], %[four] \n\t" "or %[b], %[src0], %[b] \n\t" "psllh %[src0], %[g], %[four] \n\t" "or %[g], %[src0], %[g] \n\t" "psllh %[src0], %[r], %[four] \n\t" "or %[r], %[src0], %[r] \n\t" "punpcklhw %[src0], %[b], %[r] \n\t" "punpcklhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest2], %[src0], %[src1] \n\t" "psrlw %[dest2], %[dest2], %[eight] \n\t" "punpckhhw %[src0], %[b], %[r] \n\t" "punpckhhw %[src1], %[g], %[value] \n\t" "punpcklhw %[src_lo], %[src0], %[src1] \n\t" "punpckhhw %[src_hi], %[src0], %[src1] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask] \n\t" "punpcklwd %[src0], %[src_lo], %[src_hi] \n\t" "punpckhwd %[src1], %[src_lo], %[src_hi] \n\t" "paddw %[dest3], %[src0], %[src1] \n\t" "psrlw %[dest3], %[dest3], %[eight] \n\t" "packsswh %[src_lo], %[dest0], %[dest1] \n\t" "packsswh %[src_hi], %[dest2], %[dest3] \n\t" "packushb %[dest0], %[src_lo], %[src_hi] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_y]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_y]) \n\t" "daddiu %[src_argb4444], %[src_argb4444], 0x10 \n\t" "daddiu %[dst_y], %[dst_y], 0x08 \n\t" "daddiu %[width], %[width], -0x08 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [b] "=&f"(ftmp[4]), [g] "=&f"(ftmp[5]), [r] "=&f"(ftmp[6]), [dest0] "=&f"(ftmp[7]), [dest1] "=&f"(ftmp[8]), [dest2] "=&f"(ftmp[9]), [dest3] "=&f"(ftmp[10]) : [src_argb4444] "r"(src_argb4444), [dst_y] "r"(dst_y), [width] "r"(width), [value] "f"(value), [mask] "f"(mask), [c0] "f"(c0), [c1] "f"(c1), [eight] "f"(0x08), [four] "f"(0x04) : "memory"); } void RGB565ToUVRow_MMI(const uint8_t* 
src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t ftmp[13]; uint64_t value = 0x2020202020202020; uint64_t mask_u = 0x0026004a00700002; uint64_t mask_v = 0x00020070005e0012; uint64_t mask = 0x93; uint64_t c0 = 0x001f001f001f001f; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t c2 = 0x0007000700070007; __asm__ volatile( "daddu %[next_rgb565], %[src_rgb565], %[next_rgb565] \n\t" "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x07(%[src_rgb565]) \n\t" "gsldrc1 %[src1], 0x00(%[next_rgb565]) \n\t" "gsldlc1 %[src1], 0x07(%[next_rgb565]) \n\t" "psrlh %[dest0_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest0_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "psrlh %[r0], %[dest0_u], %[three] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest0_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest0_v], %[src0], %[c2] \n\t" "psllh %[dest0_v], %[dest0_v], %[three] \n\t" "or %[dest0_v], %[src1], %[dest0_v] \n\t" "psrlh %[src0], %[src0], %[three] \n\t" "paddh %[b0], %[b0], %[dest0_u] \n\t" "paddh %[g0], %[g0], %[dest0_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" "pshufh 
%[dest0_u], %[src0], %[mask] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" "gsldrc1 %[src0], 0x08(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_rgb565]) \n\t" "gsldrc1 %[src1], 0x08(%[next_rgb565]) \n\t" "gsldlc1 %[src1], 0x0f(%[next_rgb565]) \n\t" "psrlh %[dest1_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest1_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "psrlh %[r0], %[dest1_u], %[three] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest1_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest1_v], %[src0], %[c2] \n\t" "psllh %[dest1_v], %[dest1_v], %[three] \n\t" "or %[dest1_v], %[src1], %[dest1_v] \n\t" "psrlh %[src0], %[src0], %[three] \n\t" "paddh %[b0], %[b0], %[dest1_u] \n\t" "paddh %[g0], %[g0], %[dest1_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" 
"punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" "pshufh %[dest1_u], %[src0], %[mask] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x10(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x17(%[src_rgb565]) \n\t" "gsldrc1 %[src1], 0x10(%[next_rgb565]) \n\t" "gsldlc1 %[src1], 0x17(%[next_rgb565]) \n\t" "psrlh %[dest2_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest2_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "psrlh %[r0], %[dest2_u], %[three] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest2_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest2_v], %[src0], %[c2] \n\t" "psllh %[dest2_v], %[dest2_v], %[three] \n\t" "or %[dest2_v], %[src1], %[dest2_v] \n\t" "psrlh %[src0], %[src0], %[three] \n\t" "paddh %[b0], %[b0], %[dest2_u] \n\t" "paddh %[g0], %[g0], %[dest2_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd 
%[dest2_u], %[src0], %[src1] \n\t" "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" "pshufh %[dest2_u], %[src0], %[mask] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x18(%[src_rgb565]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_rgb565]) \n\t" "gsldrc1 %[src1], 0x18(%[next_rgb565]) \n\t" "gsldlc1 %[src1], 0x1f(%[next_rgb565]) \n\t" "psrlh %[dest3_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest3_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "psrlh %[r0], %[dest3_u], %[three] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest3_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest3_v], %[src0], %[c2] \n\t" "psllh %[dest3_v], %[dest3_v], %[three] \n\t" "or %[dest3_v], %[src1], %[dest3_v] \n\t" "psrlh %[src0], %[src0], %[three] \n\t" "paddh %[b0], %[b0], %[dest3_u] \n\t" "paddh %[g0], %[g0], %[dest3_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" 
"or %[b0], %[b0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" "pshufh %[dest3_u], %[src0], %[mask] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" "psubw %[dest3_v], %[src1], %[src0] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_rgb565], %[src_rgb565], 0x20 \n\t" "daddiu %[next_rgb565], %[next_rgb565], 0x20 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddiu %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), [dest3_v] "=&f"(ftmp[12]) : [src_rgb565] "r"(src_rgb565), [next_rgb565] 
"r"(src_stride_rgb565), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), [one] "f"(0x01) : "memory"); }
// ARGB1555ToUVRow_MMI: produces U and V bytes from two rows of 16-bit
// ARGB1555 pixels: the row at src_argb1555 and the row src_stride_argb1555
// bytes after it.
//
// Each pass of the asm loop reads 0x20 bytes (16 pixels) from each of the two
// rows, stores 8 bytes to dst_u and 8 bytes to dst_v, and subtracts 16 from
// width. The loop is bottom-tested (bgtz), so it always runs at least once.
// NOTE(review): presumably callers pass a positive multiple of 16 - confirm.
//
// Channel extraction per 16-bit pixel, as done by the masks below:
//   blue  = bits 0-4            (and with c0 = 0x001f)
//   green = bits 5-9            (low byte >> 5, or-ed with (high byte & c2) << 3)
//   red   = bits 10-14          ((high byte & c3) >> 2)
//   bit 15 is never read.
// The two rows are added per channel, horizontally adjacent results are added
// as well, and the sums are fed to pmaddhw together with the 16-bit weights
// held in mask_u / mask_v; `value` supplies the constant lane paired with the
// 0x0002 weight and `mask` (0x93) is the lane selector handed to pshufh.
//
// NOTE(review): the asm writes to operands that are declared as inputs
// (both row pointers, dst_u, dst_v, width). The statement already has 30
// operands, so they cannot simply be turned into "+r" operands.
void ARGB1555ToUVRow_MMI(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t ftmp[11]; uint64_t value = 0x2020202020202020; uint64_t mask_u = 0x0026004a00700002; uint64_t mask_v = 0x00020070005e0012; uint64_t mask = 0x93; uint64_t c0 = 0x001f001f001f001f; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t c2 = 0x0003000300030003; uint64_t c3 = 0x007c007c007c007c; __asm__ volatile(
// Turn the stride operand into a pointer to the second row
// (next = src + stride); "1:" is the loop head.
"daddu %[next_argb1555], %[src_argb1555], %[next_argb1555] \n\t" "1: \n\t"
// Bytes 0x00-0x07 of both rows (pixels 0-3): results end up in
// dest0_u (U) and dest0_v (V).
"gsldrc1 %[src0], 0x00(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb1555]) \n\t" "gsldrc1 %[src1], 0x00(%[next_argb1555]) \n\t" "gsldlc1 %[src1], 0x07(%[next_argb1555]) \n\t" "psrlh %[dest0_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest0_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "and %[r0], %[dest0_u], %[c3] \n\t" "psrlh %[r0], %[r0], %[two] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest0_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest0_v], %[src0], %[c2] \n\t" "psllh %[dest0_v], %[dest0_v], %[three] \n\t" "or %[dest0_v], %[src1], %[dest0_v] \n\t" "and %[src0], %[src0], %[c3] \n\t" "psrlh %[src0], %[src0], %[two] \n\t" "paddh %[b0], %[b0], %[dest0_u] \n\t" "paddh %[g0], %[g0], %[dest0_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest0_u], %[dest0_v] 
\n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[six] \n\t" "psllh %[g0], %[g0], %[one] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" "pshufh %[dest0_u], %[src0], %[mask] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
// Bytes 0x08-0x0f of both rows (pixels 4-7): same sequence, results in
// dest1_u (U) and dest1_v (V).
"gsldrc1 %[src0], 0x08(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_argb1555]) \n\t" "gsldrc1 %[src1], 0x08(%[next_argb1555]) \n\t" "gsldlc1 %[src1], 0x0f(%[next_argb1555]) \n\t" "psrlh %[dest1_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest1_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "and %[r0], %[dest1_u], %[c3] \n\t" "psrlh %[r0], %[r0], %[two] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest1_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest1_v], %[src0], %[c2] \n\t" "psllh %[dest1_v], %[dest1_v], %[three] \n\t" "or %[dest1_v], %[src1], %[dest1_v] \n\t" "and %[src0], %[src0], %[c3] \n\t" "psrlh %[src0], %[src0], %[two] \n\t" "paddh %[b0], %[b0], 
%[dest1_u] \n\t" "paddh %[g0], %[g0], %[dest1_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[six] \n\t" "psllh %[g0], %[g0], %[one] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" "pshufh %[dest1_u], %[src0], %[mask] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
// Pack the U words of the first two groups into dest0_u and their V words
// into dest1_u; dest0_v and dest1_v are reused as the V registers of the
// next two groups.
"packsswh %[dest0_u], %[dest0_u], %[dest1_u] \n\t" "packsswh %[dest1_u], %[dest0_v], %[dest1_v] \n\t"
// Bytes 0x10-0x17 of both rows (pixels 8-11): results in dest2_u (U) and
// dest0_v (V).
"gsldrc1 %[src0], 0x10(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x17(%[src_argb1555]) \n\t" "gsldrc1 %[src1], 0x10(%[next_argb1555]) \n\t" "gsldlc1 %[src1], 0x17(%[next_argb1555]) \n\t" "psrlh %[dest2_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest2_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "and %[r0], %[dest2_u], %[c3] \n\t" "psrlh 
%[r0], %[r0], %[two] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest2_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest0_v], %[src0], %[c2] \n\t" "psllh %[dest0_v], %[dest0_v], %[three] \n\t" "or %[dest0_v], %[src1], %[dest0_v] \n\t" "and %[src0], %[src0], %[c3] \n\t" "psrlh %[src0], %[src0], %[two] \n\t" "paddh %[b0], %[b0], %[dest2_u] \n\t" "paddh %[g0], %[g0], %[dest0_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest2_u], %[dest0_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[six] \n\t" "psllh %[g0], %[g0], %[one] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest2_u], %[dest0_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" "pshufh %[dest2_u], %[src0], %[mask] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t"
// Bytes 0x18-0x1f of both rows (pixels 12-15): results in dest3_u (U) and
// dest1_v (V).
"gsldrc1 %[src0], 0x18(%[src_argb1555]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_argb1555]) \n\t" "gsldrc1 %[src1], 0x18(%[next_argb1555]) \n\t" "gsldlc1 %[src1], 0x1f(%[next_argb1555]) \n\t" 
"psrlh %[dest3_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[src0], %[src0], %[five] \n\t" "and %[g0], %[dest3_u], %[c2] \n\t" "psllh %[g0], %[g0], %[three] \n\t" "or %[g0], %[src0], %[g0] \n\t" "and %[r0], %[dest3_u], %[c3] \n\t" "psrlh %[r0], %[r0], %[two] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest3_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[src1], %[src1], %[five] \n\t" "and %[dest1_v], %[src0], %[c2] \n\t" "psllh %[dest1_v], %[dest1_v], %[three] \n\t" "or %[dest1_v], %[src1], %[dest1_v] \n\t" "and %[src0], %[src0], %[c3] \n\t" "psrlh %[src0], %[src0], %[two] \n\t" "paddh %[b0], %[b0], %[dest3_u] \n\t" "paddh %[g0], %[g0], %[dest1_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest3_u], %[dest1_v] \n\t" "psrlh %[b0], %[src0], %[six] \n\t" "psllh %[r0], %[src0], %[one] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[six] \n\t" "psllh %[g0], %[g0], %[one] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest3_u], %[dest1_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" "pshufh %[dest3_u], %[src0], %[mask] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" "punpckhwd %[src1], 
%[dest1_v], %[g0] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t"
// Pack the remaining U words, merge them with the earlier U half in
// dest0_u and store 8 bytes to dst_u; then do the same for V (earlier half
// is in dest1_u) and store 8 bytes to dst_v.
"packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[dest0_u], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src1], %[dest0_v], %[dest1_v] \n\t" "packushb %[dest0_v], %[dest1_u], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t"
// Advance both row pointers by 0x20 bytes and both outputs by 8 bytes,
// subtract 16 from width and loop while width > 0.
"daddiu %[src_argb1555], %[src_argb1555], 0x20 \n\t" "daddiu %[next_argb1555], %[next_argb1555], 0x20 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddiu %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t"
// Operands: ftmp[0..10] are early-clobber FP-register scratch values; the
// pointers, stride and width are passed in general registers and the
// constants / shift counts in FP registers.
: [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), [dest1_v] "=&f"(ftmp[10]) : [src_argb1555] "r"(src_argb1555), [next_argb1555] "r"(src_stride_argb1555), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [c2] "f"(c2), [c3] "f"(c3), [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [eight] "f"(0x08), [six] "f"(0x06), [five] "f"(0x05), [three] "f"(0x03), [two] "f"(0x02), [one] "f"(0x01) : "memory"); } void ARGB4444ToUVRow_MMI(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t ftmp[13]; uint64_t value = 0x2020202020202020; uint64_t mask_u = 0x0026004a00700002; uint64_t mask_v = 0x00020070005e0012; uint64_t mask = 0x93; uint64_t c0 = 0x000f000f000f000f; uint64_t c1 = 0x00ff00ff00ff00ff; __asm__ volatile( "daddu %[next_argb4444], %[src_argb4444], %[next_argb4444] \n\t" "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb4444]) \n\t" "gsldrc1 %[src1], 0x00(%[next_argb4444]) \n\t" "gsldlc1 %[src1], 
0x07(%[next_argb4444]) \n\t" "psrlh %[dest0_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[g0], %[src0], %[four] \n\t" "and %[r0], %[dest0_u], %[c0] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest0_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[dest0_v], %[src1], %[four] \n\t" "and %[src0], %[src0], %[c0] \n\t" "paddh %[b0], %[b0], %[dest0_u] \n\t" "paddh %[g0], %[g0], %[dest0_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest0_u], %[dest0_v] \n\t" "psrlh %[b0], %[src0], %[four] \n\t" "psllh %[r0], %[src0], %[two] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[four] \n\t" "psllh %[g0], %[g0], %[two] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest0_u], %[src0], %[src1] \n\t" "punpckhwd %[dest0_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest0_u], %[dest0_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest0_v], %[src0], %[mask_v] \n\t" "pshufh %[dest0_u], %[src0], %[mask] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest0_u], %[b0] \n\t" "punpckhwd %[src1], %[dest0_u], %[b0] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[g0] \n\t" "punpckhwd %[src1], %[dest0_v], %[g0] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" "gsldrc1 %[src0], 0x08(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_argb4444]) \n\t" "gsldrc1 %[src1], 0x08(%[next_argb4444]) \n\t" "gsldlc1 %[src1], 
0x0f(%[next_argb4444]) \n\t" "psrlh %[dest1_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[g0], %[src0], %[four] \n\t" "and %[r0], %[dest1_u], %[c0] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest1_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[dest1_v], %[src1], %[four] \n\t" "and %[src0], %[src0], %[c0] \n\t" "paddh %[b0], %[b0], %[dest1_u] \n\t" "paddh %[g0], %[g0], %[dest1_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest1_u], %[dest1_v] \n\t" "psrlh %[b0], %[src0], %[four] \n\t" "psllh %[r0], %[src0], %[two] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[four] \n\t" "psllh %[g0], %[g0], %[two] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest1_u], %[src0], %[src1] \n\t" "punpckhwd %[dest1_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest1_u], %[dest1_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest1_v], %[src0], %[mask_v] \n\t" "pshufh %[dest1_u], %[src0], %[mask] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest1_u], %[b0] \n\t" "punpckhwd %[src1], %[dest1_u], %[b0] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[g0] \n\t" "punpckhwd %[src1], %[dest1_v], %[g0] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x10(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x17(%[src_argb4444]) \n\t" "gsldrc1 %[src1], 0x10(%[next_argb4444]) \n\t" "gsldlc1 %[src1], 
0x17(%[next_argb4444]) \n\t" "psrlh %[dest2_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[g0], %[src0], %[four] \n\t" "and %[r0], %[dest2_u], %[c0] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest2_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[dest2_v], %[src1], %[four] \n\t" "and %[src0], %[src0], %[c0] \n\t" "paddh %[b0], %[b0], %[dest2_u] \n\t" "paddh %[g0], %[g0], %[dest2_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest2_u], %[dest2_v] \n\t" "psrlh %[b0], %[src0], %[four] \n\t" "psllh %[r0], %[src0], %[two] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[four] \n\t" "psllh %[g0], %[g0], %[two] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest2_u], %[src0], %[src1] \n\t" "punpckhwd %[dest2_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest2_u], %[dest2_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest2_v], %[src0], %[mask_v] \n\t" "pshufh %[dest2_u], %[src0], %[mask] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest2_u], %[b0] \n\t" "punpckhwd %[src1], %[dest2_u], %[b0] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[g0] \n\t" "punpckhwd %[src1], %[dest2_v], %[g0] \n\t" "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x18(%[src_argb4444]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_argb4444]) \n\t" "gsldrc1 %[src1], 0x18(%[next_argb4444]) \n\t" "gsldlc1 %[src1], 
0x1f(%[next_argb4444]) \n\t" "psrlh %[dest3_u], %[src0], %[eight] \n\t" "and %[b0], %[src0], %[c0] \n\t" "and %[src0], %[src0], %[c1] \n\t" "psrlh %[g0], %[src0], %[four] \n\t" "and %[r0], %[dest3_u], %[c0] \n\t" "psrlh %[src0], %[src1], %[eight] \n\t" "and %[dest3_u], %[src1], %[c0] \n\t" "and %[src1], %[src1], %[c1] \n\t" "psrlh %[dest3_v], %[src1], %[four] \n\t" "and %[src0], %[src0], %[c0] \n\t" "paddh %[b0], %[b0], %[dest3_u] \n\t" "paddh %[g0], %[g0], %[dest3_v] \n\t" "paddh %[r0], %[r0], %[src0] \n\t" "punpcklhw %[src0], %[b0], %[r0] \n\t" "punpckhhw %[src1], %[b0], %[r0] \n\t" "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" "paddh %[src0], %[dest3_u], %[dest3_v] \n\t" "psrlh %[b0], %[src0], %[four] \n\t" "psllh %[r0], %[src0], %[two] \n\t" "or %[b0], %[b0], %[r0] \n\t" "psrlh %[r0], %[g0], %[four] \n\t" "psllh %[g0], %[g0], %[two] \n\t" "or %[g0], %[g0], %[r0] \n\t" "punpcklhw %[src0], %[g0], %[value] \n\t" "punpckhhw %[src1], %[g0], %[value] \n\t" "punpcklwd %[dest3_u], %[src0], %[src1] \n\t" "punpckhwd %[dest3_v], %[src0], %[src1] \n\t" "paddh %[g0], %[dest3_u], %[dest3_v] \n\t" "punpcklhw %[src0], %[b0], %[g0] \n\t" "punpckhhw %[src1], %[b0], %[g0] \n\t" "pmaddhw %[dest3_v], %[src0], %[mask_v] \n\t" "pshufh %[dest3_u], %[src0], %[mask] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[g0], %[src1], %[mask_v] \n\t" "pshufh %[b0], %[src1], %[mask] \n\t" "pmaddhw %[b0], %[b0], %[mask_u] \n\t" "punpcklwd %[src0], %[dest3_u], %[b0] \n\t" "punpckhwd %[src1], %[dest3_u], %[b0] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[g0] \n\t" "punpckhwd %[src1], %[dest3_v], %[g0] \n\t" "psubw %[dest3_v], %[src1], %[src0] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 
0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_argb4444], %[src_argb4444], 0x20 \n\t" "daddiu %[next_argb4444], %[next_argb4444], 0x20 \n\t" "daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddiu %[width], %[width], -0x10 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [b0] "=&f"(ftmp[2]), [g0] "=&f"(ftmp[3]), [r0] "=&f"(ftmp[4]), [dest0_u] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest2_u] "=&f"(ftmp[7]), [dest3_u] "=&f"(ftmp[8]), [dest0_v] "=&f"(ftmp[9]), [dest1_v] "=&f"(ftmp[10]), [dest2_v] "=&f"(ftmp[11]), [dest3_v] "=&f"(ftmp[12]) : [src_argb4444] "r"(src_argb4444), [next_argb4444] "r"(src_stride_argb4444), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [value] "f"(value), [c0] "f"(c0), [c1] "f"(c1), [mask] "f"(mask), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [eight] "f"(0x08), [four] "f"(0x04), [two] "f"(0x02) : "memory"); } void ARGBToUV444Row_MMI(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t ftmp[12]; const uint64_t value = 0x4040; const uint64_t mask_u = 0x0026004a00700002; const uint64_t mask_v = 0x00020070005e0012; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x07(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "dsll %[dest0_u], %[src_lo], %[sixteen] \n\t" "pinsrh_0 %[dest0_u], %[dest0_u], %[value] \n\t" "pinsrh_3 %[dest0_v], %[src_lo], %[value] \n\t" "pmaddhw %[dest0_u], %[dest0_u], %[mask_u] \n\t" "pmaddhw %[dest0_v], %[dest0_v], %[mask_v] \n\t" "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw 
%[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest0_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest0_u], %[src_lo] \n\t" "psubw %[dest0_u], %[src0], %[src1] \n\t" "psraw %[dest0_u], %[dest0_u], %[eight] \n\t" "punpcklwd %[src0], %[dest0_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest0_v], %[src_hi] \n\t" "psubw %[dest0_v], %[src1], %[src0] \n\t" "psraw %[dest0_v], %[dest0_v], %[eight] \n\t" "gsldrc1 %[src0], 0x08(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x0f(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "dsll %[dest1_u], %[src_lo], %[sixteen] \n\t" "pinsrh_0 %[dest1_u], %[dest1_u], %[value] \n\t" "pinsrh_3 %[dest1_v], %[src_lo], %[value] \n\t" "pmaddhw %[dest1_u], %[dest1_u], %[mask_u] \n\t" "pmaddhw %[dest1_v], %[dest1_v], %[mask_v] \n\t" "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest1_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest1_u], %[src_lo] \n\t" "psubw %[dest1_u], %[src0], %[src1] \n\t" "psraw %[dest1_u], %[dest1_u], %[eight] \n\t" "punpcklwd %[src0], %[dest1_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest1_v], %[src_hi] \n\t" "psubw %[dest1_v], %[src1], %[src0] \n\t" "psraw %[dest1_v], %[dest1_v], %[eight] \n\t" "gsldrc1 %[src0], 0x10(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x17(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "dsll %[dest2_u], %[src_lo], %[sixteen] \n\t" "pinsrh_0 %[dest2_u], %[dest2_u], %[value] \n\t" "pinsrh_3 %[dest2_v], %[src_lo], %[value] \n\t" "pmaddhw %[dest2_u], %[dest2_u], %[mask_u] \n\t" "pmaddhw %[dest2_v], %[dest2_v], %[mask_v] \n\t" "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], 
%[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest2_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest2_u], %[src_lo] \n\t" "psubw %[dest2_u], %[src0], %[src1] \n\t" "psraw %[dest2_u], %[dest2_u], %[eight] \n\t" "punpcklwd %[src0], %[dest2_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest2_v], %[src_hi] \n\t" "psubw %[dest2_v], %[src1], %[src0] \n\t" "psraw %[dest2_v], %[dest2_v], %[eight] \n\t" "gsldrc1 %[src0], 0x18(%[src_argb]) \n\t" "gsldlc1 %[src0], 0x1f(%[src_argb]) \n\t" "punpcklbh %[src_lo], %[src0], %[zero] \n\t" "punpckhbh %[src_hi], %[src0], %[zero] \n\t" "dsll %[dest3_u], %[src_lo], %[sixteen] \n\t" "pinsrh_0 %[dest3_u], %[dest3_u], %[value] \n\t" "pinsrh_3 %[dest3_v], %[src_lo], %[value] \n\t" "pmaddhw %[dest3_u], %[dest3_u], %[mask_u] \n\t" "pmaddhw %[dest3_v], %[dest3_v], %[mask_v] \n\t" "dsll %[src_lo], %[src_hi], %[sixteen] \n\t" "pinsrh_0 %[src_lo], %[src_lo], %[value] \n\t" "pinsrh_3 %[src_hi], %[src_hi], %[value] \n\t" "pmaddhw %[src_lo], %[src_lo], %[mask_u] \n\t" "pmaddhw %[src_hi], %[src_hi], %[mask_v] \n\t" "punpcklwd %[src0], %[dest3_u], %[src_lo] \n\t" "punpckhwd %[src1], %[dest3_u], %[src_lo] \n\t" "psubw %[dest3_u], %[src0], %[src1] \n\t" "psraw %[dest3_u], %[dest3_u], %[eight] \n\t" "punpcklwd %[src0], %[dest3_v], %[src_hi] \n\t" "punpckhwd %[src1], %[dest3_v], %[src_hi] \n\t" "psubw %[dest3_v], %[src1], %[src0] \n\t" "psraw %[dest3_v], %[dest3_v], %[eight] \n\t" "packsswh %[src0], %[dest0_u], %[dest1_u] \n\t" "packsswh %[src1], %[dest2_u], %[dest3_u] \n\t" "packushb %[dest0_u], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_u], 0x07(%[dst_u]) \n\t" "gssdrc1 %[dest0_u], 0x00(%[dst_u]) \n\t" "packsswh %[src0], %[dest0_v], %[dest1_v] \n\t" "packsswh %[src1], %[dest2_v], %[dest3_v] \n\t" "packushb %[dest0_v], %[src0], %[src1] \n\t" "gssdlc1 %[dest0_v], 0x07(%[dst_v]) \n\t" "gssdrc1 %[dest0_v], 0x00(%[dst_v]) \n\t" "daddiu %[src_argb], %[src_argb], 0x20 \n\t" 
"daddiu %[dst_u], %[dst_u], 0x08 \n\t" "daddiu %[dst_v], %[dst_v], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bgtz %[width], 1b \n\t" : [src0] "=&f"(ftmp[0]), [src1] "=&f"(ftmp[1]), [src_lo] "=&f"(ftmp[2]), [src_hi] "=&f"(ftmp[3]), [dest0_u] "=&f"(ftmp[4]), [dest0_v] "=&f"(ftmp[5]), [dest1_u] "=&f"(ftmp[6]), [dest1_v] "=&f"(ftmp[7]), [dest2_u] "=&f"(ftmp[8]), [dest2_v] "=&f"(ftmp[9]), [dest3_u] "=&f"(ftmp[10]), [dest3_v] "=&f"(ftmp[11]) : [src_argb] "r"(src_argb), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [mask_u] "f"(mask_u), [mask_v] "f"(mask_v), [value] "f"(value), [zero] "f"(0x00), [sixteen] "f"(0x10), [eight] "f"(0x08) : "memory"); }

// Convert a row of ARGB pixels to gray, keeping byte 3 of every pixel.
//
// Each pass of the loop handles 8 bytes (2 pixels). Bytes 0-2 of a pixel are
// all replaced by (29 * byte0 + 150 * byte1 + 77 * byte2 + 128) >> 8, using
// the 16-bit weights in mask2; byte 3 is copied from the source unchanged
// (mask3 selects it, mask4 clears it in the computed value).
//
//   src_argb  source row, read 8 bytes per pass.
//   dst_argb  destination row, written 8 bytes per pass.
//   width     number of pixels. A non-positive width does nothing. Pixels are
//             processed in pairs, so an odd width still reads and writes one
//             extra pixel on the final pass; callers are expected to pass an
//             even width.
void ARGBGrayRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  uint64_t src, src_lo, src_hi, src37, dest, dest_lo, dest_hi;
  uint64_t tmp0, tmp1;
  const uint64_t mask0 = 0x0;                    // zero, for byte -> halfword widening
  const uint64_t mask1 = 0x01;                   // halfword inserted into lane 3
  const uint64_t mask2 = 0x0080004D0096001DULL;  // weights 29, 150, 77, 128
  const uint64_t mask3 = 0xFF000000FF000000ULL;  // byte 3 of each pixel
  const uint64_t mask4 = ~mask3;                 // bytes 0-2 of each pixel
  const uint64_t shift = 0x08;

  // The loop below is bottom-tested; without this guard an empty row would
  // still be read and written once.
  if (width <= 0) {
    return;
  }

  // The asm advances both pointers and counts width down, so all three are
  // read-write ("+r") operands; GCC does not allow an asm to modify
  // input-only operands.
  __asm__ volatile(
      "1:                                                           \n\t"
      "gsldlc1    %[src],          0x07(%[src_ptr])                 \n\t"
      "gsldrc1    %[src],          0x00(%[src_ptr])                 \n\t"
      // Remember byte 3 of both pixels for the final merge.
      "and        %[src37],        %[src],            %[mask3]      \n\t"

      // First pixel: widen to halfwords, put mask1 into lane 3 so that the
      // 0x0080 weight yields the rounding term, then multiply-accumulate.
      // NOTE(review): relies on pinsrh_3 inserting the low halfword of
      // mask1 into lane 3 - confirm against the Loongson MMI manual.
      "punpcklbh  %[src_lo],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_lo],       %[src_lo],         %[mask1]      \n\t"
      "pmaddhw    %[dest_lo],      %[src_lo],         %[mask2]      \n\t"
      // Add the two partial sums, scale by >> 8 and replicate the result.
      "punpcklwd  %[tmp0],         %[dest_lo],        %[dest_lo]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_lo],        %[dest_lo]    \n\t"
      "paddw      %[dest_lo],      %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest_lo],      %[dest_lo],        %[shift]      \n\t"
      "packsswh   %[dest_lo],      %[dest_lo],        %[dest_lo]    \n\t"

      // Second pixel: same sequence on the high half of the load.
      "punpckhbh  %[src_hi],       %[src],            %[mask0]      \n\t"
      "pinsrh_3   %[src_hi],       %[src_hi],         %[mask1]      \n\t"
      "pmaddhw    %[dest_hi],      %[src_hi],         %[mask2]      \n\t"
      "punpcklwd  %[tmp0],         %[dest_hi],        %[dest_hi]    \n\t"
      "punpckhwd  %[tmp1],         %[dest_hi],        %[dest_hi]    \n\t"
      "paddw      %[dest_hi],      %[tmp0],           %[tmp1]       \n\t"
      "psrlw      %[dest_hi],      %[dest_hi],        %[shift]      \n\t"
      "packsswh   %[dest_hi],      %[dest_hi],        %[dest_hi]    \n\t"

      // Narrow to bytes, drop the computed byte 3 and merge the saved one.
      "packushb   %[dest],         %[dest_lo],        %[dest_hi]    \n\t"
      "and        %[dest],         %[dest],           %[mask4]      \n\t"
      "or         %[dest],         %[dest],           %[src37]      \n\t"
      "gssdlc1    %[dest],         0x07(%[dst_ptr])                 \n\t"
      "gssdrc1    %[dest],         0x00(%[dst_ptr])                 \n\t"

      "daddiu     %[src_ptr],      %[src_ptr],        0x08          \n\t"
      "daddiu     %[dst_ptr],      %[dst_ptr],        0x08          \n\t"
      // daddiu (non-trapping) and bgtz match the other row loops in this
      // file; bgtz also terminates when width is not a multiple of 2.
      "daddiu     %[width],        %[width],          -0x02         \n\t"
      "bgtz       %[width],        1b                               \n\t"
      : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo),
        [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo),
        [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [src] "=&f"(src),
        [dest] "=&f"(dest), [src37] "=&f"(src37),
        [src_ptr] "+r"(src_argb), [dst_ptr] "+r"(dst_argb),
        [width] "+r"(width)
      : [shift] "f"(shift), [mask0] "f"(mask0), [mask1] "f"(mask1),
        [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4)
      : "memory");
}
// Convert a row of image to Sepia tone. void ARGBSepiaRow_MMI(uint8_t* dst_argb, int width) { uint64_t dest, dest_lo, dest_hi, dest37, dest0, dest1, dest2; uint64_t tmp0, tmp1; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x002300440011ULL; const uint64_t mask2 = 0x002D00580016ULL; const uint64_t mask3 = 0x003200620018ULL; const uint64_t mask4 = 0xFF000000FF000000ULL; const uint64_t shift = 0x07; __asm__ volatile( "1: \n\t" "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "and %[dest37], %[dest], %[mask4] \n\t" "punpcklbh %[dest_lo], %[dest], %[mask0] \n\t" "pmaddhw %[dest0], %[dest_lo], %[mask1] \n\t" "pmaddhw %[dest1], %[dest_lo], %[mask2] \n\t" "pmaddhw %[dest2], %[dest_lo], %[mask3] \n\t" "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" "paddw %[dest0], %[tmp0], %[tmp1] \n\t" "psrlw %[dest0], %[dest0], %[shift] \n\t" "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" "paddw %[dest1], %[tmp0], %[tmp1] \n\t" "psrlw %[dest1], %[dest1], %[shift] \n\t" "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" "punpckhbh %[dest_hi], %[dest], %[mask0] \n\t" "pmaddhw %[dest0], %[dest_hi], %[mask1] \n\t" "pmaddhw %[dest1], %[dest_hi], %[mask2] \n\t" "pmaddhw %[dest2], 
%[dest_hi], %[mask3] \n\t" "punpcklwd %[tmp0], %[dest0], %[dest1] \n\t" "punpckhwd %[tmp1], %[dest0], %[dest1] \n\t" "paddw %[dest0], %[tmp0], %[tmp1] \n\t" "psrlw %[dest0], %[dest0], %[shift] \n\t" "punpcklwd %[tmp0], %[dest2], %[mask0] \n\t" "punpckhwd %[tmp1], %[dest2], %[mask0] \n\t" "paddw %[dest1], %[tmp0], %[tmp1] \n\t" "psrlw %[dest1], %[dest1], %[shift] \n\t" "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "or %[dest], %[dest], %[dest37] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [dest] "=&f"(dest) : [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift) : "memory"); } // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). 
void ARGBColorMatrixRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi, dest0, dest1, dest2, dest3; uint64_t matrix, matrix_hi, matrix_lo; uint64_t tmp0, tmp1; const uint64_t shift0 = 0x06; const uint64_t shift1 = 0x08; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x08; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[src_lo], %[src], %[mask0] \n\t" "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" "gsldrc1 %[matrix], 0x00(%[matrix_ptr]) \n\t" "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" "paddw %[dest0], %[tmp0], %[tmp1] \n\t" "psraw %[dest0], %[dest0], %[shift0] \n\t" "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "pmaddhw %[dest_lo], %[src_lo], %[matrix_lo] \n\t" "pmaddhw %[dest_hi], %[src_lo], %[matrix_hi] \n\t" "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" "paddw %[dest1], %[tmp0], %[tmp1] \n\t" "psraw %[dest1], %[dest1], %[shift0] \n\t" "punpckhbh %[src_hi], %[src], %[mask0] \n\t" "gsldlc1 %[matrix], 0x07(%[matrix_ptr]) \n\t" "gsldrc1 %[matrix], 
0x00(%[matrix_ptr]) \n\t" "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" "paddw %[dest2], %[tmp0], %[tmp1] \n\t" "psraw %[dest2], %[dest2], %[shift0] \n\t" "gsldlc1 %[matrix], 0x0f(%[matrix_ptr]) \n\t" "gsldrc1 %[matrix], 0x08(%[matrix_ptr]) \n\t" "punpcklbh %[matrix_lo], %[matrix], %[mask0] \n\t" "psllh %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "psrah %[matrix_lo], %[matrix_lo], %[shift1] \n\t" "punpckhbh %[matrix_hi], %[matrix], %[mask0] \n\t" "psllh %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "psrah %[matrix_hi], %[matrix_hi], %[shift1] \n\t" "pmaddhw %[dest_lo], %[src_hi], %[matrix_lo] \n\t" "pmaddhw %[dest_hi], %[src_hi], %[matrix_hi] \n\t" "punpcklwd %[tmp0], %[dest_lo], %[dest_hi] \n\t" "punpckhwd %[tmp1], %[dest_lo], %[dest_hi] \n\t" "paddw %[dest3], %[tmp0], %[tmp1] \n\t" "psraw %[dest3], %[dest3], %[shift0] \n\t" "packsswh %[tmp0], %[dest0], %[dest1] \n\t" "packsswh %[tmp1], %[dest2], %[dest3] \n\t" "packushb %[dest], %[tmp0], %[tmp1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [matrix_hi] "=&f"(matrix_hi), [matrix_lo] "=&f"(matrix_lo), [matrix] 
"=&f"(matrix) : [src_ptr] "r"(src_argb), [matrix_ptr] "r"(matrix_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), [shift0] "f"(shift0), [shift1] "f"(shift1), [mask0] "f"(mask0), [mask1] "f"(mask1) : "memory"); } void ARGBShadeRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { uint64_t src, src_hi, src_lo, dest, dest_lo, dest_hi; const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[src_lo], %[src], %[src] \n\t" "punpckhbh %[src_hi], %[src], %[src] \n\t" "punpcklbh %[value], %[value], %[value] \n\t" "pmulhuh %[dest_lo], %[src_lo], %[value] \n\t" "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" "pmulhuh %[dest_hi], %[src_hi], %[value] \n\t" "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src] "=&f"(src), [dest] "=&f"(dest) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(width), [value] "f"(value), [shift] "f"(shift) : "memory"); } void ARGBMultiplyRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { uint64_t src0, src0_hi, src0_lo, src1, src1_hi, src1_lo; uint64_t dest, dest_lo, dest_hi; const uint64_t mask = 0x0; __asm__ volatile( "1: \n\t" "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" "punpcklbh %[src0_lo], %[src0], %[src0] \n\t" "punpckhbh %[src0_hi], %[src0], %[src0] \n\t" "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" "punpcklbh %[src1_lo], %[src1], %[mask] \n\t" "punpckhbh %[src1_hi], %[src1], %[mask] \n\t" "pmulhuh %[dest_lo], 
%[src0_lo], %[src1_lo] \n\t" "pmulhuh %[dest_hi], %[src0_hi], %[src1_hi] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width), [mask] "f"(mask) : "memory"); } void ARGBAddRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { uint64_t src0, src1, dest; __asm__ volatile( "1: \n\t" "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" "paddusb %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } void ARGBSubtractRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { uint64_t src0, src1, dest; __asm__ volatile( "1: \n\t" "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" "psubusb %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 
%[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [width] "r"(width) : "memory"); } // Sobel functions which mimics SSSE3. void SobelXRow_MMI(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { uint64_t y00 = 0, y10 = 0, y20 = 0; uint64_t y02 = 0, y12 = 0, y22 = 0; uint64_t zero = 0x0; uint64_t sobel = 0x0; __asm__ volatile( "1: \n\t" "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // a_sub=src_y0[i+2] "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // b=src_y1[i] "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // b_sub=src_y1[i+2] "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" "gsldlc1 %[y20], 0x07(%[src_y2]) \n\t" // c=src_y2[i] "gsldrc1 %[y20], 0x00(%[src_y2]) \n\t" "gsldlc1 %[y22], 0x09(%[src_y2]) \n\t" // c_sub=src_y2[i+2] "gsldrc1 %[y22], 0x02(%[src_y2]) \n\t" "punpcklbh %[y00], %[y00], %[zero] \n\t" "punpcklbh %[y10], %[y10], %[zero] \n\t" "punpcklbh %[y20], %[y20], %[zero] \n\t" "punpcklbh %[y02], %[y02], %[zero] \n\t" "punpcklbh %[y12], %[y12], %[zero] \n\t" "punpcklbh %[y22], %[y22], %[zero] \n\t" "paddh %[y00], %[y00], %[y10] \n\t" // a+b "paddh %[y20], %[y20], %[y10] \n\t" // c+b "paddh %[y00], %[y00], %[y20] \n\t" // a+2b+c "paddh %[y02], %[y02], %[y12] \n\t" // a_sub+b_sub "paddh %[y22], %[y22], %[y12] \n\t" // c_sub+b_sub "paddh %[y02], %[y02], %[y22] \n\t" // a_sub+2b_sub+c_sub "pmaxsh %[y10], %[y00], %[y02] \n\t" "pminsh %[y20], %[y00], %[y02] \n\t" "psubh %[sobel], %[y10], %[y20] \n\t" // Abs "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" "gsldrc1 %[y00], 
0x04(%[src_y0]) \n\t" "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" "gsldlc1 %[y20], 0x0B(%[src_y2]) \n\t" "gsldrc1 %[y20], 0x04(%[src_y2]) \n\t" "gsldlc1 %[y22], 0x0D(%[src_y2]) \n\t" "gsldrc1 %[y22], 0x06(%[src_y2]) \n\t" "punpcklbh %[y00], %[y00], %[zero] \n\t" "punpcklbh %[y10], %[y10], %[zero] \n\t" "punpcklbh %[y20], %[y20], %[zero] \n\t" "punpcklbh %[y02], %[y02], %[zero] \n\t" "punpcklbh %[y12], %[y12], %[zero] \n\t" "punpcklbh %[y22], %[y22], %[zero] \n\t" "paddh %[y00], %[y00], %[y10] \n\t" "paddh %[y20], %[y20], %[y10] \n\t" "paddh %[y00], %[y00], %[y20] \n\t" "paddh %[y02], %[y02], %[y12] \n\t" "paddh %[y22], %[y22], %[y12] \n\t" "paddh %[y02], %[y02], %[y22] \n\t" "pmaxsh %[y10], %[y00], %[y02] \n\t" "pminsh %[y20], %[y00], %[y02] \n\t" "psubh %[y00], %[y10], %[y20] \n\t" "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 "gssdrc1 %[sobel], 0(%[dst_sobelx]) \n\t" "gssdlc1 %[sobel], 7(%[dst_sobelx]) \n\t" "daddiu %[src_y0], %[src_y0], 8 \n\t" "daddiu %[src_y1], %[src_y1], 8 \n\t" "daddiu %[src_y2], %[src_y2], 8 \n\t" "daddiu %[dst_sobelx], %[dst_sobelx], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y10] "=&f"(y10), [y20] "=&f"(y20), [y02] "=&f"(y02), [y12] "=&f"(y12), [y22] "=&f"(y22) : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [src_y2] "r"(src_y2), [dst_sobelx] "r"(dst_sobelx), [width] "r"(width), [zero] "f"(zero) : "memory"); } void SobelYRow_MMI(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { uint64_t y00 = 0, y01 = 0, y02 = 0; uint64_t y10 = 0, y11 = 0, y12 = 0; uint64_t zero = 0x0; uint64_t sobel = 0x0; __asm__ volatile( "1: \n\t" "gsldlc1 %[y00], 0x07(%[src_y0]) \n\t" // a=src_y0[i] "gsldrc1 %[y00], 0x00(%[src_y0]) \n\t" "gsldlc1 %[y01], 
0x08(%[src_y0]) \n\t" // b=src_y0[i+1] "gsldrc1 %[y01], 0x01(%[src_y0]) \n\t" "gsldlc1 %[y02], 0x09(%[src_y0]) \n\t" // c=src_y0[i+2] "gsldrc1 %[y02], 0x02(%[src_y0]) \n\t" "gsldlc1 %[y10], 0x07(%[src_y1]) \n\t" // a_sub=src_y1[i] "gsldrc1 %[y10], 0x00(%[src_y1]) \n\t" "gsldlc1 %[y11], 0x08(%[src_y1]) \n\t" // b_sub=src_y1[i+1] "gsldrc1 %[y11], 0x01(%[src_y1]) \n\t" "gsldlc1 %[y12], 0x09(%[src_y1]) \n\t" // c_sub=src_y1[i+2] "gsldrc1 %[y12], 0x02(%[src_y1]) \n\t" "punpcklbh %[y00], %[y00], %[zero] \n\t" "punpcklbh %[y01], %[y01], %[zero] \n\t" "punpcklbh %[y02], %[y02], %[zero] \n\t" "punpcklbh %[y10], %[y10], %[zero] \n\t" "punpcklbh %[y11], %[y11], %[zero] \n\t" "punpcklbh %[y12], %[y12], %[zero] \n\t" "paddh %[y00], %[y00], %[y01] \n\t" // a+b "paddh %[y02], %[y02], %[y01] \n\t" // c+b "paddh %[y00], %[y00], %[y02] \n\t" // a+2b+c "paddh %[y10], %[y10], %[y11] \n\t" // a_sub+b_sub "paddh %[y12], %[y12], %[y11] \n\t" // c_sub+b_sub "paddh %[y10], %[y10], %[y12] \n\t" // a_sub+2b_sub+c_sub "pmaxsh %[y02], %[y00], %[y10] \n\t" "pminsh %[y12], %[y00], %[y10] \n\t" "psubh %[sobel], %[y02], %[y12] \n\t" // Abs "gsldlc1 %[y00], 0x0B(%[src_y0]) \n\t" "gsldrc1 %[y00], 0x04(%[src_y0]) \n\t" "gsldlc1 %[y01], 0x0C(%[src_y0]) \n\t" "gsldrc1 %[y01], 0x05(%[src_y0]) \n\t" "gsldlc1 %[y02], 0x0D(%[src_y0]) \n\t" "gsldrc1 %[y02], 0x06(%[src_y0]) \n\t" "gsldlc1 %[y10], 0x0B(%[src_y1]) \n\t" "gsldrc1 %[y10], 0x04(%[src_y1]) \n\t" "gsldlc1 %[y11], 0x0C(%[src_y1]) \n\t" "gsldrc1 %[y11], 0x05(%[src_y1]) \n\t" "gsldlc1 %[y12], 0x0D(%[src_y1]) \n\t" "gsldrc1 %[y12], 0x06(%[src_y1]) \n\t" "punpcklbh %[y00], %[y00], %[zero] \n\t" "punpcklbh %[y01], %[y01], %[zero] \n\t" "punpcklbh %[y02], %[y02], %[zero] \n\t" "punpcklbh %[y10], %[y10], %[zero] \n\t" "punpcklbh %[y11], %[y11], %[zero] \n\t" "punpcklbh %[y12], %[y12], %[zero] \n\t" "paddh %[y00], %[y00], %[y01] \n\t" "paddh %[y02], %[y02], %[y01] \n\t" "paddh %[y00], %[y00], %[y02] \n\t" "paddh %[y10], %[y10], %[y11] \n\t" "paddh %[y12], 
%[y12], %[y11] \n\t" "paddh %[y10], %[y10], %[y12] \n\t" "pmaxsh %[y02], %[y00], %[y10] \n\t" "pminsh %[y12], %[y00], %[y10] \n\t" "psubh %[y00], %[y02], %[y12] \n\t" "packushb %[sobel], %[sobel], %[y00] \n\t" // clamp255 "gssdrc1 %[sobel], 0(%[dst_sobely]) \n\t" "gssdlc1 %[sobel], 7(%[dst_sobely]) \n\t" "daddiu %[src_y0], %[src_y0], 8 \n\t" "daddiu %[src_y1], %[src_y1], 8 \n\t" "daddiu %[dst_sobely], %[dst_sobely], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [sobel] "=&f"(sobel), [y00] "=&f"(y00), [y01] "=&f"(y01), [y02] "=&f"(y02), [y10] "=&f"(y10), [y11] "=&f"(y11), [y12] "=&f"(y12) : [src_y0] "r"(src_y0), [src_y1] "r"(src_y1), [dst_sobely] "r"(dst_sobely), [width] "r"(width), [zero] "f"(zero) : "memory"); } void SobelRow_MMI(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { double temp[3]; uint64_t c1 = 0xff000000ff000000; __asm__ volatile( "1: \n\t" "gsldlc1 %[t0], 0x07(%[src_sobelx]) \n\t" // a=src_sobelx[i] "gsldrc1 %[t0], 0x00(%[src_sobelx]) \n\t" "gsldlc1 %[t1], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] "gsldrc1 %[t1], 0x00(%[src_sobely]) \n\t" // s7 s6 s5 s4 s3 s2 s1 s0 = a+b "paddusb %[t2] , %[t0], %[t1] \n\t" // s3 s2 s1 s0->s3 s3 s2 s2 s1 s1 s0 s0 "punpcklbh %[t0], %[t2], %[t2] \n\t" // s1 s1 s0 s0->s1 s2 s1 s1 s0 s0 s0 s0 "punpcklbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" // 255 s1 s1 s1 s55 s0 s0 s0 "gssdrc1 %[t1], 0x00(%[dst_argb]) \n\t" "gssdlc1 %[t1], 0x07(%[dst_argb]) \n\t" // s3 s3 s2 s2->s3 s3 s3 s3 s2 s2 s2 s2 "punpckhbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" // 255 s3 s3 s3 255 s2 s2 s2 "gssdrc1 %[t1], 0x08(%[dst_argb]) \n\t" "gssdlc1 %[t1], 0x0f(%[dst_argb]) \n\t" // s7 s6 s5 s4->s7 s7 s6 s6 s5 s5 s4 s4 "punpckhbh %[t0], %[t2], %[t2] \n\t" // s5 s5 s4 s4->s5 s5 s5 s5 s4 s4 s4 s4 "punpcklbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" "gssdrc1 %[t1], 0x10(%[dst_argb]) \n\t" "gssdlc1 %[t1], 0x17(%[dst_argb]) \n\t" // s7 s7 
s6 s6->s7 s7 s7 s7 s6 s6 s6 s6 "punpckhbh %[t1], %[t0], %[t0] \n\t" "or %[t1], %[t1], %[c1] \n\t" "gssdrc1 %[t1], 0x18(%[dst_argb]) \n\t" "gssdlc1 %[t1], 0x1f(%[dst_argb]) \n\t" "daddiu %[dst_argb], %[dst_argb], 32 \n\t" "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" "daddiu %[src_sobely], %[src_sobely], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) : "memory"); } void SobelToPlaneRow_MMI(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { uint64_t tr = 0; uint64_t tb = 0; __asm__ volatile( "1: \n\t" "gsldrc1 %[tr], 0x0(%[src_sobelx]) \n\t" "gsldlc1 %[tr], 0x7(%[src_sobelx]) \n\t" // r=src_sobelx[i] "gsldrc1 %[tb], 0x0(%[src_sobely]) \n\t" "gsldlc1 %[tb], 0x7(%[src_sobely]) \n\t" // b=src_sobely[i] "paddusb %[tr], %[tr], %[tb] \n\t" // g "gssdrc1 %[tr], 0x0(%[dst_y]) \n\t" "gssdlc1 %[tr], 0x7(%[dst_y]) \n\t" "daddiu %[dst_y], %[dst_y], 8 \n\t" "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" "daddiu %[src_sobely], %[src_sobely], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [tr] "=&f"(tr), [tb] "=&f"(tb) : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), [dst_y] "r"(dst_y), [width] "r"(width) : "memory"); } void SobelXYRow_MMI(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { uint64_t temp[3]; uint64_t result = 0; uint64_t gb = 0; uint64_t cr = 0; uint64_t c1 = 0xffffffffffffffff; __asm__ volatile( "1: \n\t" "gsldlc1 %[tr], 0x07(%[src_sobelx]) \n\t" // r=src_sobelx[i] "gsldrc1 %[tr], 0x00(%[src_sobelx]) \n\t" "gsldlc1 %[tb], 0x07(%[src_sobely]) \n\t" // b=src_sobely[i] "gsldrc1 %[tb], 0x00(%[src_sobely]) \n\t" "paddusb %[tg] , %[tr], %[tb] \n\t" // g // g3 b3 g2 b2 g1 b1 g0 b0 "punpcklbh %[gb], %[tb], %[tg] \n\t" // c3 r3 r2 r2 c1 r1 c0 r0 
"punpcklbh %[cr], %[tr], %[c1] \n\t" // c1 r1 g1 b1 c0 r0 g0 b0 "punpcklhw %[result], %[gb], %[cr] \n\t" "gssdrc1 %[result], 0x00(%[dst_argb]) \n\t" "gssdlc1 %[result], 0x07(%[dst_argb]) \n\t" // c3 r3 g3 b3 c2 r2 g2 b2 "punpckhhw %[result], %[gb], %[cr] \n\t" "gssdrc1 %[result], 0x08(%[dst_argb]) \n\t" "gssdlc1 %[result], 0x0f(%[dst_argb]) \n\t" // g7 b7 g6 b6 g5 b5 g4 b4 "punpckhbh %[gb], %[tb], %[tg] \n\t" // c7 r7 c6 r6 c5 r5 c4 r4 "punpckhbh %[cr], %[tr], %[c1] \n\t" // c5 r5 g5 b5 c4 r4 g4 b4 "punpcklhw %[result], %[gb], %[cr] \n\t" "gssdrc1 %[result], 0x10(%[dst_argb]) \n\t" "gssdlc1 %[result], 0x17(%[dst_argb]) \n\t" // c7 r7 g7 b7 c6 r6 g6 b6 "punpckhhw %[result], %[gb], %[cr] \n\t" "gssdrc1 %[result], 0x18(%[dst_argb]) \n\t" "gssdlc1 %[result], 0x1f(%[dst_argb]) \n\t" "daddiu %[dst_argb], %[dst_argb], 32 \n\t" "daddiu %[src_sobelx], %[src_sobelx], 8 \n\t" "daddiu %[src_sobely], %[src_sobely], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [tr] "=&f"(temp[0]), [tb] "=&f"(temp[1]), [tg] "=&f"(temp[2]), [gb] "=&f"(gb), [cr] "=&f"(cr), [result] "=&f"(result) : [src_sobelx] "r"(src_sobelx), [src_sobely] "r"(src_sobely), [dst_argb] "r"(dst_argb), [width] "r"(width), [c1] "f"(c1) : "memory"); } void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Copy a Y to RGB. 
uint64_t src, dest; const uint64_t mask0 = 0x00ffffff00ffffffULL; const uint64_t mask1 = ~mask0; __asm__ volatile( "1: \n\t" "gslwlc1 %[src], 0x03(%[src_ptr]) \n\t" "gslwrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[src], %[src], %[src] \n\t" "punpcklhw %[dest], %[src], %[src] \n\t" "and %[dest], %[dest], %[mask0] \n\t" "or %[dest], %[dest], %[mask1] \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "punpckhhw %[dest], %[src], %[src] \n\t" "and %[dest], %[dest], %[mask0] \n\t" "or %[dest], %[dest], %[mask1] \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [dest] "=&f"(dest) : [src_ptr] "r"(src_y), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [width] "r"(width) : "memory"); } // TODO - respect YuvConstants void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants*, int width) { uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x55; const uint64_t mask2 = 0xAA; const uint64_t mask3 = 0xFF; const uint64_t mask4 = 0x4A354A354A354A35ULL; const uint64_t mask5 = 0x0488048804880488ULL; const uint64_t shift0 = 0x08; const uint64_t shift1 = 0x06; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[src_lo], %[src], %[mask0] \n\t" "punpckhbh %[src_hi], %[src], %[mask0] \n\t" "pshufh %[src], %[src_lo], %[mask0] \n\t" "psllh %[dest_lo], %[src], %[shift0] \n\t" "paddush %[dest_lo], %[dest_lo], %[src] \n\t" "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" "pshufh %[src], %[src_lo], %[mask1] \n\t" "psllh %[dest_hi], %[src], 
%[shift0] \n\t" "paddush %[dest_hi], %[dest_hi], %[src] \n\t" "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "pshufh %[src], %[src_lo], %[mask2] \n\t" "psllh %[dest_lo], %[src], %[shift0] \n\t" "paddush %[dest_lo], %[dest_lo], %[src] \n\t" "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" "pshufh %[src], %[src_lo], %[mask3] \n\t" "psllh %[dest_hi], %[src], %[shift0] \n\t" "paddush %[dest_hi], %[dest_hi], %[src] \n\t" "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "pshufh %[src], %[src_hi], %[mask0] \n\t" "psllh %[dest_lo], %[src], %[shift0] \n\t" "paddush %[dest_lo], %[dest_lo], %[src] \n\t" "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" "pshufh %[src], %[src_hi], %[mask1] \n\t" "psllh %[dest_hi], %[src], %[shift0] \n\t" "paddush %[dest_hi], %[dest_hi], %[src] \n\t" "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x10(%[dst_ptr]) \n\t" "pshufh %[src], %[src_hi], %[mask2] \n\t" "psllh %[dest_lo], %[src], 
%[shift0] \n\t" "paddush %[dest_lo], %[dest_lo], %[src] \n\t" "pmulhuh %[dest_lo], %[dest_lo], %[mask4] \n\t" "psubh %[dest_lo], %[dest_lo], %[mask5] \n\t" "psrah %[dest_lo], %[dest_lo], %[shift1] \n\t" "pinsrh_3 %[dest_lo], %[dest_lo], %[mask3] \n\t" "pshufh %[src], %[src_hi], %[mask3] \n\t" "psllh %[dest_hi], %[src], %[shift0] \n\t" "paddush %[dest_hi], %[dest_hi], %[src] \n\t" "pmulhuh %[dest_hi], %[dest_hi], %[mask4] \n\t" "psubh %[dest_hi], %[dest_hi], %[mask5] \n\t" "psrah %[dest_hi], %[dest_hi], %[shift1] \n\t" "pinsrh_3 %[dest_hi], %[dest_hi], %[mask3] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" "daddi %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) : [src_ptr] "r"(src_y), [dst_ptr] "r"(rgb_buf), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [mask5] "f"(mask5), [shift0] "f"(shift0), [shift1] "f"(shift1), [width] "r"(width) : "memory"); } void MirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { uint64_t source, src0, src1, dest; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x1b; src += width - 1; __asm__ volatile( "1: \n\t" "gsldlc1 %[source], 0(%[src_ptr]) \n\t" "gsldrc1 %[source], -7(%[src_ptr]) \n\t" "punpcklbh %[src0], %[source], %[mask0] \n\t" "pshufh %[src0], %[src0], %[mask1] \n\t" "punpckhbh %[src1], %[source], %[mask0] \n\t" "pshufh %[src1], %[src1], %[mask1] \n\t" "packushb %[dest], %[src1], %[src0] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddi %[src_ptr], %[src_ptr], -0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [source] "=&f"(source), 
[dest] "=&f"(dest), [src0] "=&f"(src0), [src1] "=&f"(src1) : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), [width] "r"(width) : "memory"); } void MirrorSplitUVRow_MMI(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t src0, src1, dest0, dest1; const uint64_t mask0 = 0x00ff00ff00ff00ffULL; const uint64_t mask1 = 0x1b; const uint64_t shift = 0x08; src_uv += (width - 1) << 1; __asm__ volatile( "1: \n\t" "gsldlc1 %[src0], 1(%[src_ptr]) \n\t" "gsldrc1 %[src0], -6(%[src_ptr]) \n\t" "gsldlc1 %[src1], -7(%[src_ptr]) \n\t" "gsldrc1 %[src1], -14(%[src_ptr]) \n\t" "and %[dest0], %[src0], %[mask0] \n\t" "pshufh %[dest0], %[dest0], %[mask1] \n\t" "and %[dest1], %[src1], %[mask0] \n\t" "pshufh %[dest1], %[dest1], %[mask1] \n\t" "packushb %[dest0], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest0], 0x07(%[dstu_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dstu_ptr]) \n\t" "psrlh %[dest0], %[src0], %[shift] \n\t" "pshufh %[dest0], %[dest0], %[mask1] \n\t" "psrlh %[dest1], %[src1], %[shift] \n\t" "pshufh %[dest1], %[dest1], %[mask1] \n\t" "packushb %[dest0], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest0], 0x07(%[dstv_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dstv_ptr]) \n\t" "daddi %[src_ptr], %[src_ptr], -0x10 \n\t" "daddiu %[dstu_ptr], %[dstu_ptr], 0x08 \n\t" "daddiu %[dstv_ptr], %[dstv_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), [src1] "=&f"(src1) : [src_ptr] "r"(src_uv), [dstu_ptr] "r"(dst_u), [dstv_ptr] "r"(dst_v), [width] "r"(width), [mask0] "f"(mask0), [mask1] "f"(mask1), [shift] "f"(shift) : "memory"); } void ARGBMirrorRow_MMI(const uint8_t* src, uint8_t* dst, int width) { src += (width - 1) * 4; uint64_t temp = 0x0; uint64_t shuff = 0x4e; // 01 00 11 10 __asm__ volatile( "1: \n\t" "gsldlc1 %[temp], 3(%[src]) \n\t" "gsldrc1 %[temp], -4(%[src]) \n\t" "pshufh %[temp], %[temp], %[shuff] \n\t" "gssdrc1 %[temp], 0x0(%[dst]) \n\t" "gssdlc1 
%[temp], 0x7(%[dst]) \n\t" "daddiu %[src], %[src], -0x08 \n\t" "daddiu %[dst], %[dst], 0x08 \n\t" "daddiu %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [temp] "=&f"(temp) : [src] "r"(src), [dst] "r"(dst), [width] "r"(width), [shuff] "f"(shuff) : "memory"); } void SplitUVRow_MMI(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t c0 = 0x00ff00ff00ff00ff; uint64_t temp[4]; uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x00(%[src_uv]) \n\t" "gsldlc1 %[t0], 0x07(%[src_uv]) \n\t" "gsldrc1 %[t1], 0x08(%[src_uv]) \n\t" "gsldlc1 %[t1], 0x0f(%[src_uv]) \n\t" "and %[t2], %[t0], %[c0] \n\t" "and %[t3], %[t1], %[c0] \n\t" "packushb %[t2], %[t2], %[t3] \n\t" "gssdrc1 %[t2], 0x0(%[dst_u]) \n\t" "gssdlc1 %[t2], 0x7(%[dst_u]) \n\t" "psrlh %[t2], %[t0], %[shift] \n\t" "psrlh %[t3], %[t1], %[shift] \n\t" "packushb %[t2], %[t2], %[t3] \n\t" "gssdrc1 %[t2], 0x0(%[dst_v]) \n\t" "gssdlc1 %[t2], 0x7(%[dst_v]) \n\t" "daddiu %[src_uv], %[src_uv], 16 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), [t3] "=&f"(temp[3]) : [src_uv] "r"(src_uv), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) : "memory"); } void MergeUVRow_MMI(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { uint64_t temp[3]; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x0(%[src_u]) \n\t" "gsldlc1 %[t0], 0x7(%[src_u]) \n\t" "gsldrc1 %[t1], 0x0(%[src_v]) \n\t" "gsldlc1 %[t1], 0x7(%[src_v]) \n\t" "punpcklbh %[t2], %[t0], %[t1] \n\t" "gssdrc1 %[t2], 0x0(%[dst_uv]) \n\t" "gssdlc1 %[t2], 0x7(%[dst_uv]) \n\t" "punpckhbh %[t2], %[t0], %[t1] \n\t" "gssdrc1 %[t2], 0x8(%[dst_uv]) \n\t" "gssdlc1 %[t2], 0xf(%[dst_uv]) \n\t" "daddiu %[src_u], %[src_u], 8 \n\t" "daddiu %[src_v], %[src_v], 8 \n\t" "daddiu %[dst_uv], %[dst_uv], 16 \n\t" "daddiu 
%[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]) : [dst_uv] "r"(dst_uv), [src_u] "r"(src_u), [src_v] "r"(src_v), [width] "r"(width) : "memory"); } void SplitRGBRow_MMI(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { uint64_t src[4]; uint64_t dest_hi, dest_lo, dest; __asm__ volatile( "1: \n\t" "gslwlc1 %[src0], 0x03(%[src_ptr]) \n\t" "gslwrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gslwlc1 %[src1], 0x06(%[src_ptr]) \n\t" "gslwrc1 %[src1], 0x03(%[src_ptr]) \n\t" "punpcklbh %[dest_lo], %[src0], %[src1] \n\t" "gslwlc1 %[src2], 0x09(%[src_ptr]) \n\t" "gslwrc1 %[src2], 0x06(%[src_ptr]) \n\t" "gslwlc1 %[src3], 0x0c(%[src_ptr]) \n\t" "gslwrc1 %[src3], 0x09(%[src_ptr]) \n\t" "punpcklbh %[dest_hi], %[src2], %[src3] \n\t" "punpcklhw %[dest], %[dest_lo], %[dest_hi] \n\t" "gsswlc1 %[dest], 0x03(%[dstr_ptr]) \n\t" "gsswrc1 %[dest], 0x00(%[dstr_ptr]) \n\t" "punpckhwd %[dest], %[dest], %[dest] \n\t" "gsswlc1 %[dest], 0x03(%[dstg_ptr]) \n\t" "gsswrc1 %[dest], 0x00(%[dstg_ptr]) \n\t" "punpckhhw %[dest], %[dest_lo], %[dest_hi] \n\t" "gsswlc1 %[dest], 0x03(%[dstb_ptr]) \n\t" "gsswrc1 %[dest], 0x00(%[dstb_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x0c \n\t" "daddiu %[dstr_ptr], %[dstr_ptr], 0x04 \n\t" "daddiu %[dstg_ptr], %[dstg_ptr], 0x04 \n\t" "daddiu %[dstb_ptr], %[dstb_ptr], 0x04 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src[0]), [src1] "=&f"(src[1]), [src2] "=&f"(src[2]), [src3] "=&f"(src[3]), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) : [src_ptr] "r"(src_rgb), [dstr_ptr] "r"(dst_r), [dstg_ptr] "r"(dst_g), [dstb_ptr] "r"(dst_b), [width] "r"(width) : "memory"); } void MergeRGBRow_MMI(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width) { uint64_t srcr, srcg, srcb, dest; uint64_t srcrg_hi, srcrg_lo, srcbz_hi, srcbz_lo; const uint64_t temp = 0x0; __asm__ 
volatile( "1: \n\t" "gsldlc1 %[srcr], 0x07(%[srcr_ptr]) \n\t" "gsldrc1 %[srcr], 0x00(%[srcr_ptr]) \n\t" "gsldlc1 %[srcg], 0x07(%[srcg_ptr]) \n\t" "gsldrc1 %[srcg], 0x00(%[srcg_ptr]) \n\t" "punpcklbh %[srcrg_lo], %[srcr], %[srcg] \n\t" "punpckhbh %[srcrg_hi], %[srcr], %[srcg] \n\t" "gsldlc1 %[srcb], 0x07(%[srcb_ptr]) \n\t" "gsldrc1 %[srcb], 0x00(%[srcb_ptr]) \n\t" "punpcklbh %[srcbz_lo], %[srcb], %[temp] \n\t" "punpckhbh %[srcbz_hi], %[srcb], %[temp] \n\t" "punpcklhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" "gsswlc1 %[dest], 0x03(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "punpckhwd %[dest], %[dest], %[dest] \n\t" "gsswlc1 %[dest], 0x06(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x03(%[dst_ptr]) \n\t" "punpckhhw %[dest], %[srcrg_lo], %[srcbz_lo] \n\t" "gsswlc1 %[dest], 0x09(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x06(%[dst_ptr]) \n\t" "punpckhwd %[dest], %[dest], %[dest] \n\t" "gsswlc1 %[dest], 0x0c(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x09(%[dst_ptr]) \n\t" "punpcklhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" "gsswlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x0c(%[dst_ptr]) \n\t" "punpckhwd %[dest], %[dest], %[dest] \n\t" "gsswlc1 %[dest], 0x12(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "punpckhhw %[dest], %[srcrg_hi], %[srcbz_hi] \n\t" "gsswlc1 %[dest], 0x15(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x12(%[dst_ptr]) \n\t" "punpckhwd %[dest], %[dest], %[dest] \n\t" "gsswlc1 %[dest], 0x18(%[dst_ptr]) \n\t" "gsswrc1 %[dest], 0x15(%[dst_ptr]) \n\t" "daddiu %[srcr_ptr], %[srcr_ptr], 0x08 \n\t" "daddiu %[srcg_ptr], %[srcg_ptr], 0x08 \n\t" "daddiu %[srcb_ptr], %[srcb_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x18 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [srcr] "=&f"(srcr), [srcg] "=&f"(srcg), [srcb] "=&f"(srcb), [dest] "=&f"(dest), [srcrg_hi] "=&f"(srcrg_hi), [srcrg_lo] "=&f"(srcrg_lo), [srcbz_hi] "=&f"(srcbz_hi), [srcbz_lo] "=&f"(srcbz_lo) : [srcr_ptr] "r"(src_r), [srcg_ptr] "r"(src_g), [srcb_ptr] "r"(src_b), 
[dst_ptr] "r"(dst_rgb), [width] "r"(width), [temp] "f"(temp) : "memory"); } // Filter 2 rows of YUY2 UV's (422) into U and V (420). void YUY2ToUVRow_MMI(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t c0 = 0xff00ff00ff00ff00; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t temp[3]; uint64_t data[4]; uint64_t shift = 0x08; uint64_t src_stride = 0x0; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" "daddu %[src_stride], %[src_yuy2], %[src_stride_yuy2] \n\t" "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" "gsldrc1 %[t2], 0x08(%[src_yuy2]) \n\t" "gsldlc1 %[t2], 0x0f(%[src_yuy2]) \n\t" "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "psrlh %[t0], %[t0], %[shift] \n\t" "psrlh %[t1], %[t1], %[shift] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d0], %[t0], %[c1] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" "gsldrc1 %[t2], 0x18(%[src_yuy2]) \n\t" "gsldlc1 %[t2], 0x1f(%[src_yuy2]) \n\t" "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "psrlh %[t0], %[t0], %[shift] \n\t" "psrlh %[t1], %[t1], %[shift] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d2], %[t0], %[c1] \n\t" "psrlh %[d3], %[t1], %[shift] \n\t" "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" "gssdlc1 %[d1], 
0x7(%[dst_v]) \n\t" "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" "daddiu %[width], %[width], -16 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) : [src_yuy2] "r"(src_yuy2), [src_stride_yuy2] "r"(src_stride_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) : "memory"); } // Copy row of YUY2 UV's (422) into U and V (422). void YUY2ToUV422Row_MMI(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { uint64_t c0 = 0xff00ff00ff00ff00; uint64_t c1 = 0x00ff00ff00ff00ff; uint64_t temp[2]; uint64_t data[4]; uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "psrlh %[t0], %[t0], %[shift] \n\t" "psrlh %[t1], %[t1], %[shift] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d0], %[t0], %[c1] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" "gsldrc1 %[t0], 0x10(%[src_yuy2]) \n\t" "gsldlc1 %[t0], 0x17(%[src_yuy2]) \n\t" "gsldrc1 %[t1], 0x18(%[src_yuy2]) \n\t" "gsldlc1 %[t1], 0x1f(%[src_yuy2]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "psrlh %[t0], %[t0], %[shift] \n\t" "psrlh %[t1], %[t1], %[shift] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d2], %[t0], %[c1] \n\t" "psrlh %[d3], %[t1], %[shift] \n\t" "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_yuy2], %[src_yuy2], 32 \n\t" "daddiu 
%[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" "daddiu %[width], %[width], -16 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) : [src_yuy2] "r"(src_yuy2), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [c0] "f"(c0), [c1] "f"(c1), [shift] "f"(shift) : "memory"); } // Copy row of YUY2 Y's (422) into Y (420/422). void YUY2ToYRow_MMI(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { uint64_t c0 = 0x00ff00ff00ff00ff; uint64_t temp[2]; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x00(%[src_yuy2]) \n\t" "gsldlc1 %[t0], 0x07(%[src_yuy2]) \n\t" "gsldrc1 %[t1], 0x08(%[src_yuy2]) \n\t" "gsldlc1 %[t1], 0x0f(%[src_yuy2]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t" "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t" "daddiu %[src_yuy2], %[src_yuy2], 16 \n\t" "daddiu %[dst_y], %[dst_y], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]) : [src_yuy2] "r"(src_yuy2), [dst_y] "r"(dst_y), [width] "r"(width), [c0] "f"(c0) : "memory"); } // Filter 2 rows of UYVY UV's (422) into U and V (420). void UYVYToUVRow_MMI(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { // Output a row of UV values. 
uint64_t c0 = 0x00ff00ff00ff00ff; uint64_t temp[3]; uint64_t data[4]; uint64_t shift = 0x08; uint64_t src_stride = 0x0; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" "daddu %[src_stride], %[src_uyvy], %[src_stride_uyvy] \n\t" "gsldrc1 %[t1], 0x00(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x07(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" "gsldrc1 %[t2], 0x08(%[src_uyvy]) \n\t" "gsldlc1 %[t2], 0x0f(%[src_uyvy]) \n\t" "gsldrc1 %[t1], 0x08(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x0f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d0], %[t0], %[c0] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" "gsldrc1 %[t1], 0x10(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x17(%[src_stride]) \n\t" "pavgb %[t0], %[t0], %[t1] \n\t" "gsldrc1 %[t2], 0x18(%[src_uyvy]) \n\t" "gsldlc1 %[t2], 0x1f(%[src_uyvy]) \n\t" "gsldrc1 %[t1], 0x18(%[src_stride]) \n\t" "gsldlc1 %[t1], 0x1f(%[src_stride]) \n\t" "pavgb %[t1], %[t2], %[t1] \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d2], %[t0], %[c0] \n\t" "psrlh %[d3], %[t1], %[shift] \n\t" "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" "daddiu %[width], %[width], -16 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [t2] "=&f"(temp[2]), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]), [src_stride] "=&r"(src_stride) : [src_uyvy] "r"(src_uyvy), 
[src_stride_uyvy] "r"(src_stride_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) : "memory"); } // Copy row of UYVY UV's (422) into U and V (422). void UYVYToUV422Row_MMI(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { // Output a row of UV values. uint64_t c0 = 0x00ff00ff00ff00ff; uint64_t temp[2]; uint64_t data[4]; uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t" "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t" "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t" "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d0], %[t0], %[c0] \n\t" "psrlh %[d1], %[t1], %[shift] \n\t" "gsldrc1 %[t0], 0x10(%[src_uyvy]) \n\t" "gsldlc1 %[t0], 0x17(%[src_uyvy]) \n\t" "gsldrc1 %[t1], 0x18(%[src_uyvy]) \n\t" "gsldlc1 %[t1], 0x1f(%[src_uyvy]) \n\t" "and %[t0], %[t0], %[c0] \n\t" "and %[t1], %[t1], %[c0] \n\t" "packushb %[t0], %[t0], %[t1] \n\t" "mov.s %[t1], %[t0] \n\t" "and %[d2], %[t0], %[c0] \n\t" "psrlh %[d3], %[t1], %[shift] \n\t" "packushb %[d0], %[d0], %[d2] \n\t" "packushb %[d1], %[d1], %[d3] \n\t" "gssdrc1 %[d0], 0x0(%[dst_u]) \n\t" "gssdlc1 %[d0], 0x7(%[dst_u]) \n\t" "gssdrc1 %[d1], 0x0(%[dst_v]) \n\t" "gssdlc1 %[d1], 0x7(%[dst_v]) \n\t" "daddiu %[src_uyvy], %[src_uyvy], 32 \n\t" "daddiu %[dst_u], %[dst_u], 8 \n\t" "daddiu %[dst_v], %[dst_v], 8 \n\t" "daddiu %[width], %[width], -16 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) : [src_uyvy] "r"(src_uyvy), [dst_u] "r"(dst_u), [dst_v] "r"(dst_v), [width] "r"(width), [c0] "f"(c0), [shift] "f"(shift) : "memory"); } // Copy row of UYVY Y's (422) into Y (420/422). void UYVYToYRow_MMI(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // Output a row of Y values. 
uint64_t c0 = 0x00ff00ff00ff00ff;  // keeps the low byte of each 16-bit lane
  uint64_t shift = 0x08;  // drops the first byte so odd bytes land in lanes
  uint64_t temp[2];

  // Each iteration reads 16 source bytes and writes the 8 odd-indexed ones
  // to dst_y. The loop runs while the remaining width is positive, 8 samples
  // at a time.
  __asm__ volatile(
      "1: \n\t"
      "gsldrc1 %[t0], 0x00(%[src_uyvy]) \n\t"
      "gsldlc1 %[t0], 0x07(%[src_uyvy]) \n\t"
      "gsldrc1 %[t1], 0x08(%[src_uyvy]) \n\t"
      "gsldlc1 %[t1], 0x0f(%[src_uyvy]) \n\t"
      // Shift the whole 64-bit value right by one byte, then mask each lane.
      "dsrl %[t0], %[t0], %[shift] \n\t"
      "dsrl %[t1], %[t1], %[shift] \n\t"
      "and %[t0], %[t0], %[c0] \n\t"
      "and %[t1], %[t1], %[c0] \n\t"
      "packushb %[t0], %[t0], %[t1] \n\t"
      "gssdrc1 %[t0], 0x0(%[dst_y]) \n\t"
      "gssdlc1 %[t0], 0x7(%[dst_y]) \n\t"
      "daddiu %[src_uyvy], %[src_uyvy], 16 \n\t"
      "daddiu %[dst_y], %[dst_y], 8 \n\t"
      "daddiu %[width], %[width], -8 \n\t"
      "bgtz %[width], 1b \n\t"
      "nop \n\t"
      // Pointers and counter are advanced inside the asm: read-write operands.
      : [t0] "=&f"(temp[0]), [t1] "=&f"(temp[1]),
        [src_uyvy] "+r"(src_uyvy), [dst_y] "+r"(dst_y), [width] "+r"(width)
      : [c0] "f"(c0), [shift] "f"(shift)
      : "memory");
}

// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// This code mimics the SSSE3 version for better testability.
void ARGBBlendRow_MMI(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { uint64_t src0, src1, dest, alpha, src0_hi, src0_lo, src1_hi, src1_lo, dest_hi, dest_lo; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x00FFFFFF00FFFFFFULL; const uint64_t mask2 = 0x00FF00FF00FF00FFULL; const uint64_t mask3 = 0xFF; const uint64_t mask4 = ~mask1; const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" "psubush %[alpha], %[mask2], %[src0_lo] \n\t" "pshufh %[alpha], %[alpha], %[mask3] \n\t" "pmullh %[dest_lo], %[src1_lo], %[alpha] \n\t" "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" "paddush %[dest_lo], %[dest_lo], %[src0_lo] \n\t" "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" "psubush %[alpha], %[mask2], %[src0_hi] \n\t" "pshufh %[alpha], %[alpha], %[mask3] \n\t" "pmullh %[dest_hi], %[src1_hi], %[alpha] \n\t" "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" "paddush %[dest_hi], %[dest_hi], %[src0_hi] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "and %[dest], %[dest], %[mask1] \n\t" "or %[dest], %[dest], %[mask4] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [alpha] "=&f"(alpha), [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo) : [src0_ptr] "r"(src_argb), [src1_ptr] "r"(src_argb1), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] 
"f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [mask4] "f"(mask4), [shift] "f"(shift), [width] "r"(width) : "memory"); } void BlendPlaneRow_MMI(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, uint8_t* dst, int width) { uint64_t source0, source1, dest, alph; uint64_t src0_hi, src0_lo, src1_hi, src1_lo, alpha_hi, alpha_lo, dest_hi, dest_lo; uint64_t alpha_rev, alpha_rev_lo, alpha_rev_hi; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0xFFFFFFFFFFFFFFFFULL; const uint64_t mask2 = 0x00FF00FF00FF00FFULL; const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldlc1 %[src0], 0x07(%[src0_ptr]) \n\t" "gsldrc1 %[src0], 0x00(%[src0_ptr]) \n\t" "punpcklbh %[src0_lo], %[src0], %[mask0] \n\t" "punpckhbh %[src0_hi], %[src0], %[mask0] \n\t" "gsldlc1 %[src1], 0x07(%[src1_ptr]) \n\t" "gsldrc1 %[src1], 0x00(%[src1_ptr]) \n\t" "punpcklbh %[src1_lo], %[src1], %[mask0] \n\t" "punpckhbh %[src1_hi], %[src1], %[mask0] \n\t" "gsldlc1 %[alpha], 0x07(%[alpha_ptr]) \n\t" "gsldrc1 %[alpha], 0x00(%[alpha_ptr]) \n\t" "psubusb %[alpha_r], %[mask1], %[alpha] \n\t" "punpcklbh %[alpha_lo], %[alpha], %[mask0] \n\t" "punpckhbh %[alpha_hi], %[alpha], %[mask0] \n\t" "punpcklbh %[alpha_rlo], %[alpha_r], %[mask0] \n\t" "punpckhbh %[alpha_rhi], %[alpha_r], %[mask0] \n\t" "pmullh %[dest_lo], %[src0_lo], %[alpha_lo] \n\t" "pmullh %[dest], %[src1_lo], %[alpha_rlo] \n\t" "paddush %[dest_lo], %[dest_lo], %[dest] \n\t" "paddush %[dest_lo], %[dest_lo], %[mask2] \n\t" "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" "pmullh %[dest_hi], %[src0_hi], %[alpha_hi] \n\t" "pmullh %[dest], %[src1_hi], %[alpha_rhi] \n\t" "paddush %[dest_hi], %[dest_hi], %[dest] \n\t" "paddush %[dest_hi], %[dest_hi], %[mask2] \n\t" "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" "daddiu %[alpha_ptr], 
%[alpha_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(source0), [src1] "=&f"(source1), [alpha] "=&f"(alph), [dest] "=&f"(dest), [src0_hi] "=&f"(src0_hi), [src0_lo] "=&f"(src0_lo), [src1_hi] "=&f"(src1_hi), [src1_lo] "=&f"(src1_lo), [alpha_hi] "=&f"(alpha_hi), [alpha_lo] "=&f"(alpha_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [alpha_rlo] "=&f"(alpha_rev_lo), [alpha_rhi] "=&f"(alpha_rev_hi), [alpha_r] "=&f"(alpha_rev) : [src0_ptr] "r"(src0), [src1_ptr] "r"(src1), [alpha_ptr] "r"(alpha), [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) : "memory"); } // Multiply source RGB by alpha and store to destination. // This code mimics the SSSE3 version for better testability. void ARGBAttenuateRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, int width) { uint64_t src, src_hi, src_lo, dest, dest_hi, dest_lo, alpha; const uint64_t mask0 = 0xFF; const uint64_t mask1 = 0xFF000000FF000000ULL; const uint64_t mask2 = ~mask1; const uint64_t shift = 0x08; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[src_lo], %[src], %[src] \n\t" "punpckhbh %[src_hi], %[src], %[src] \n\t" "pshufh %[alpha], %[src_lo], %[mask0] \n\t" "pmulhuh %[dest_lo], %[alpha], %[src_lo] \n\t" "psrlh %[dest_lo], %[dest_lo], %[shift] \n\t" "pshufh %[alpha], %[src_hi], %[mask0] \n\t" "pmulhuh %[dest_hi], %[alpha], %[src_hi] \n\t" "psrlh %[dest_hi], %[dest_hi], %[shift] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "and %[dest], %[dest], %[mask2] \n\t" "and %[src], %[src], %[mask1] \n\t" "or %[dest], %[dest], %[src] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src] 
"=&f"(src), [dest] "=&f"(dest), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [alpha] "=&f"(alpha) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [shift] "f"(shift), [width] "r"(width) : "memory"); } void ComputeCumulativeSumRow_MMI(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { int64_t row_sum[2] = {0, 0}; uint64_t src, dest0, dest1, presrc0, presrc1, dest; const uint64_t mask = 0x0; __asm__ volatile( "xor %[row_sum0], %[row_sum0], %[row_sum0] \n\t" "xor %[row_sum1], %[row_sum1], %[row_sum1] \n\t" "1: \n\t" "gslwlc1 %[src], 0x03(%[row_ptr]) \n\t" "gslwrc1 %[src], 0x00(%[row_ptr]) \n\t" "punpcklbh %[src], %[src], %[mask] \n\t" "punpcklhw %[dest0], %[src], %[mask] \n\t" "punpckhhw %[dest1], %[src], %[mask] \n\t" "paddw %[row_sum0], %[row_sum0], %[dest0] \n\t" "paddw %[row_sum1], %[row_sum1], %[dest1] \n\t" "gsldlc1 %[presrc0], 0x07(%[pre_ptr]) \n\t" "gsldrc1 %[presrc0], 0x00(%[pre_ptr]) \n\t" "gsldlc1 %[presrc1], 0x0f(%[pre_ptr]) \n\t" "gsldrc1 %[presrc1], 0x08(%[pre_ptr]) \n\t" "paddw %[dest0], %[row_sum0], %[presrc0] \n\t" "paddw %[dest1], %[row_sum1], %[presrc1] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" "daddiu %[row_ptr], %[row_ptr], 0x04 \n\t" "daddiu %[pre_ptr], %[pre_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x01 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [row_sum0] "+&f"(row_sum[0]), [row_sum1] "+&f"(row_sum[1]), [presrc0] "=&f"(presrc0), [presrc1] "=&f"(presrc1) : [row_ptr] "r"(row), [pre_ptr] "r"(previous_cumsum), [dst_ptr] "r"(cumsum), [width] "r"(width), [mask] "f"(mask) : "memory"); } // C version 2x2 -> 2x1. 
void InterpolateRow_MMI(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int width, int source_y_fraction) { if (source_y_fraction == 0) { __asm__ volatile( "1: \n\t" "ld $t0, 0x0(%[src_ptr]) \n\t" "sd $t0, 0x0(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 8 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), [width] "r"(width) : "memory"); return; } if (source_y_fraction == 128) { uint64_t uv = 0x0; uint64_t uv_stride = 0x0; __asm__ volatile( "1: \n\t" "gsldrc1 %[uv], 0x0(%[src_ptr]) \n\t" "gsldlc1 %[uv], 0x7(%[src_ptr]) \n\t" "daddu $t0, %[src_ptr], %[stride] \n\t" "gsldrc1 %[uv_stride], 0x0($t0) \n\t" "gsldlc1 %[uv_stride], 0x7($t0) \n\t" "pavgb %[uv], %[uv], %[uv_stride] \n\t" "gssdrc1 %[uv], 0x0(%[dst_ptr]) \n\t" "gssdlc1 %[uv], 0x7(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 8 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [uv] "=&f"(uv), [uv_stride] "=&f"(uv_stride) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(width), [stride] "r"((int64_t)src_stride) : "memory"); return; } const uint8_t* src_ptr1 = src_ptr + src_stride; uint64_t temp; uint64_t data[4]; uint64_t zero = 0x0; uint64_t c0 = 0x0080008000800080; uint64_t fy0 = 0x0100010001000100; uint64_t shift = 0x8; __asm__ volatile( "pshufh %[fy1], %[fy1], %[zero] \n\t" "psubh %[fy0], %[fy0], %[fy1] \n\t" "1: \n\t" "gsldrc1 %[t0], 0x0(%[src_ptr]) \n\t" "gsldlc1 %[t0], 0x7(%[src_ptr]) \n\t" "punpcklbh %[d0], %[t0], %[zero] \n\t" "punpckhbh %[d1], %[t0], %[zero] \n\t" "gsldrc1 %[t0], 0x0(%[src_ptr1]) \n\t" "gsldlc1 %[t0], 0x7(%[src_ptr1]) \n\t" "punpcklbh %[d2], %[t0], %[zero] \n\t" "punpckhbh %[d3], %[t0], %[zero] \n\t" "pmullh %[d0], %[d0], %[fy0] \n\t" "pmullh %[d2], %[d2], %[fy1] \n\t" "paddh %[d0], %[d0], %[d2] \n\t" "paddh %[d0], %[d0], %[c0] \n\t" "psrlh %[d0], %[d0], 
%[shift] \n\t" "pmullh %[d1], %[d1], %[fy0] \n\t" "pmullh %[d3], %[d3], %[fy1] \n\t" "paddh %[d1], %[d1], %[d3] \n\t" "paddh %[d1], %[d1], %[c0] \n\t" "psrlh %[d1], %[d1], %[shift] \n\t" "packushb %[d0], %[d0], %[d1] \n\t" "gssdrc1 %[d0], 0x0(%[dst_ptr]) \n\t" "gssdlc1 %[d0], 0x7(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 8 \n\t" "daddiu %[src_ptr1], %[src_ptr1], 8 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 8 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [t0] "=&f"(temp), [d0] "=&f"(data[0]), [d1] "=&f"(data[1]), [d2] "=&f"(data[2]), [d3] "=&f"(data[3]) : [src_ptr] "r"(src_ptr), [src_ptr1] "r"(src_ptr1), [dst_ptr] "r"(dst_ptr), [width] "r"(width), [fy1] "f"(source_y_fraction), [fy0] "f"(fy0), [c0] "f"(c0), [shift] "f"(shift), [zero] "f"(zero) : "memory"); } // Use first 4 shuffler values to reorder ARGB channels. void ARGBShuffleRow_MMI(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { uint64_t source, dest0, dest1, dest; const uint64_t mask0 = 0x0; const uint64_t mask1 = (shuffler[0] & 0x03) | ((shuffler[1] & 0x03) << 2) | ((shuffler[2] & 0x03) << 4) | ((shuffler[3] & 0x03) << 6); __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[dest0], %[src], %[mask0] \n\t" "pshufh %[dest0], %[dest0], %[mask1] \n\t" "punpckhbh %[dest1], %[src], %[mask0] \n\t" "pshufh %[dest1], %[dest1], %[mask1] \n\t" "packushb %[dest], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [mask0] "f"(mask0), [mask1] "f"(mask1), [width] "r"(width) : "memory"); } void I422ToYUY2Row_MMI(const uint8_t* src_y, const uint8_t* 
src_u, const uint8_t* src_v, uint8_t* dst_frame, int width) { uint64_t temp[3]; uint64_t vu = 0x0; __asm__ volatile( "1: \n\t" "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] "punpcklbh %[vu], %[tu], %[tv] \n\t" // g "punpcklbh %[tu], %[ty], %[vu] \n\t" // g "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" "punpckhbh %[tu], %[ty], %[vu] \n\t" // g "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" "daddiu %[src_y], %[src_y], 8 \n\t" "daddiu %[src_u], %[src_u], 4 \n\t" "daddiu %[src_v], %[src_v], 4 \n\t" "daddiu %[dst_frame], %[dst_frame], 16 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), [vu] "=&f"(vu) : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), [dst_frame] "r"(dst_frame), [width] "r"(width) : "memory"); } void I422ToUYVYRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_frame, int width) { uint64_t temp[3]; uint64_t vu = 0x0; __asm__ volatile( "1: \n\t" "gsldlc1 %[ty], 0x7(%[src_y]) \n\t" // r=src_sobelx[i] "gsldrc1 %[ty], 0x0(%[src_y]) \n\t" // r=src_sobelx[i] "gslwlc1 %[tu], 0x3(%[src_u]) \n\t" // b=src_sobely[i] "gslwrc1 %[tu], 0x0(%[src_u]) \n\t" // b=src_sobely[i] "gslwlc1 %[tv], 0x3(%[src_v]) \n\t" // b=src_sobely[i] "gslwrc1 %[tv], 0x0(%[src_v]) \n\t" // b=src_sobely[i] "punpcklbh %[vu], %[tu], %[tv] \n\t" // g "punpcklbh %[tu], %[vu], %[ty] \n\t" // g "gssdlc1 %[tu], 0x7(%[dst_frame]) \n\t" "gssdrc1 %[tu], 0x0(%[dst_frame]) \n\t" "punpckhbh %[tu], %[vu], %[ty] \n\t" // g "gssdlc1 %[tu], 0x0F(%[dst_frame]) \n\t" "gssdrc1 %[tu], 0x08(%[dst_frame]) \n\t" "daddiu %[src_y], 
%[src_y], 8 \n\t" "daddiu %[src_u], %[src_u], 4 \n\t" "daddiu %[src_v], %[src_v], 4 \n\t" "daddiu %[dst_frame], %[dst_frame], 16 \n\t" "daddiu %[width], %[width], -8 \n\t" "bgtz %[width], 1b \n\t" "nop \n\t" : [ty] "=&f"(temp[1]), [tu] "=&f"(temp[1]), [tv] "=&f"(temp[1]), [vu] "=&f"(vu) : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v), [dst_frame] "r"(dst_frame), [width] "r"(width) : "memory"); } void ARGBCopyAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { uint64_t source, dest; const uint64_t mask0 = 0xff000000ff000000ULL; const uint64_t mask1 = ~mask0; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "and %[src], %[src], %[mask0] \n\t" "and %[dest], %[dest], %[mask1] \n\t" "or %[dest], %[src], %[dest] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(source), [dest] "=&f"(dest) : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), [width] "r"(width) : "memory"); } void ARGBExtractAlphaRow_MMI(const uint8_t* src_argb, uint8_t* dst_a, int width) { uint64_t src, dest0, dest1, dest_lo, dest_hi, dest; const uint64_t mask = 0xff000000ff000000ULL; const uint64_t shift = 0x18; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "and %[dest0], %[src], %[mask] \n\t" "psrlw %[dest0], %[dest0], %[shift] \n\t" "gsldlc1 %[src], 0x0f(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x08(%[src_ptr]) \n\t" "and %[dest1], %[src], %[mask] \n\t" "psrlw %[dest1], %[dest1], %[shift] \n\t" "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" "gsldlc1 %[src], 0x17(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x10(%[src_ptr]) \n\t" "and %[dest0], %[src], %[mask] \n\t" "psrlw 
%[dest0], %[dest0], %[shift] \n\t" "gsldlc1 %[src], 0x1f(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x18(%[src_ptr]) \n\t" "and %[dest1], %[src], %[mask] \n\t" "psrlw %[dest1], %[dest1], %[shift] \n\t" "packsswh %[dest_hi], %[dest0], %[dest1] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [dest] "=&f"(dest), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest_lo] "=&f"(dest_lo), [dest_hi] "=&f"(dest_hi) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_a), [mask] "f"(mask), [shift] "f"(shift), [width] "r"(width) : "memory"); } void ARGBCopyYToAlphaRow_MMI(const uint8_t* src, uint8_t* dst, int width) { uint64_t source, dest0, dest1, dest; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x00ffffff00ffffffULL; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[dest0], %[mask0], %[src] \n\t" "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" "gsldlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gsldrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "and %[dest], %[dest], %[mask1] \n\t" "or %[dest], %[dest], %[dest1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" "gsldlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gsldrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "and %[dest], %[dest], %[mask1] \n\t" "or %[dest], %[dest], %[dest1] \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "punpckhbh %[dest0], %[mask0], %[src] \n\t" "punpcklhw %[dest1], %[mask0], %[dest0] \n\t" "gsldlc1 %[dest], 0x17(%[dst_ptr]) \n\t" "gsldrc1 %[dest], 0x10(%[dst_ptr]) \n\t" "and %[dest], %[dest], %[mask1] \n\t" "or %[dest], %[dest], %[dest1] \n\t" "gssdlc1 %[dest], 0x17(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 
0x10(%[dst_ptr]) \n\t" "punpckhhw %[dest1], %[mask0], %[dest0] \n\t" "gsldlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" "gsldrc1 %[dest], 0x18(%[dst_ptr]) \n\t" "and %[dest], %[dest], %[mask1] \n\t" "or %[dest], %[dest], %[dest1] \n\t" "gssdlc1 %[dest], 0x1f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x18(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x20 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(source), [dest] "=&f"(dest), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1) : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [mask0] "f"(mask0), [mask1] "f"(mask1), [width] "r"(width) : "memory"); } void I444ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y,u,v; uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile ( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub "or %[ub], %[ub], %[mask] \n\t"//must sign extension "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask] \n\t"//sign extension "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 "pmulhuh %[y], %[y], %[yg] \n\t"//y1 "punpcklbh %[u], %[u], %[zero] \n\t"//u "paddsh 
%[b_vec0], %[y], %[bb] \n\t" "pmullh %[b_vec1], %[u], %[ub] \n\t" "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" "psrah %[b_vec0], %[b_vec0], %[six] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t"//v "paddsh %[g_vec0], %[y], %[bg] \n\t" "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "psrah %[g_vec0], %[g_vec0], %[six] \n\t" "paddsh %[r_vec0], %[y], %[br] \n\t" "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" "psrah %[r_vec0], %[r_vec0], %[six] \n\t" "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [alpha]"f"(-1), [six]"f"(0x6), [five]"f"(0x55), [mask]"f"(mask) : "memory" ); } // Also used for 420 
/* I422ToARGBRow_MMI: converts one row of planar YUV with half-width chroma into 4-byte pixels, 4 pixels per loop pass. Coefficients are loaded once from yuvconstants at byte offsets 0x00 (ub), 0x20 (ug and vg), 0x40 (vr), 0x60 (bb), 0x80 (bg), 0xa0 (br), 0xc0 (yg). Each pass reads 4 Y bytes and duplicates each of 2 U and 2 V samples across two adjacent pixels; the U/V loads fetch 4 bytes although the pointers advance by only 2, so 2 bytes beyond the consumed samples are read. 16 bytes are stored per pass, with alpha lanes taken from the constant -1; per the lane comments the packed pixel is f|r|g|b from high to low byte. The loop body runs before the test and exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): y_ptr/u_ptr/v_ptr/rgbbuf_ptr/width are modified by the asm but declared as input-only operands. */ void I422ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y,u,v; uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub "or %[ub], %[ub], %[mask] \n\t"//must sign extension "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask] \n\t"//sign extension "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 "pmulhuh %[y], %[y], %[yg] \n\t"//y1 //u3|u2|u1|u0 --> u1|u1|u0|u0 "punpcklbh %[u], %[u], %[u] \n\t"//u "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec0], %[y], %[bb] \n\t" "pmullh %[b_vec1], %[u], %[ub] \n\t" "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" "psrah %[b_vec0], %[b_vec0], %[six] \n\t" //v3|v2|v1|v0 --> v1|v1|v0|v0 "punpcklbh %[v], %[v], %[v] \n\t"//v "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec0], %[y], %[bg] \n\t" "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "psrah %[g_vec0], %[g_vec0], %[six] \n\t" "paddsh %[r_vec0], %[y], %[br] \n\t" "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr 
"psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" "psrah %[r_vec0], %[r_vec0], %[six] \n\t" "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [alpha]"f"(-1), [six]"f"(0x6), [five]"f"(0x55), [mask]"f"(mask) : "memory" ); } // 10 bit YUV to ARGB /* I210ToARGBRow_MMI: same coefficient setup and 16-byte-per-pass output as the 8-bit half-width-chroma converter, but the inputs are 16-bit samples. Each pass loads 8 bytes of Y (4 samples) and 4 bytes each of U and V (2 samples); Y is shifted left by 6 before the yg multiply, each U/V sample is duplicated for two pixels, shifted right by 2 and clamped to at most 0x00ff. Alpha lanes come from the constant -1. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void I210ToARGBRow_MMI(const uint16_t* src_y, const uint16_t* src_u, const uint16_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y,u,v; uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) 
\n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask] \n\t" "1: \n\t" "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "psllh %[y], %[y], %[six] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "punpcklhw %[u], %[u], %[u] \n\t" "psrah %[u], %[u], %[two] \n\t" "punpcklhw %[v], %[v], %[v] \n\t" "psrah %[v], %[v], %[two] \n\t" "pminsh %[u], %[u], %[mask1] \n\t" "pminsh %[v], %[v], %[mask1] \n\t" "paddsh %[b_vec0], %[y], %[bb] \n\t" "pmullh %[b_vec1], %[u], %[ub] \n\t" "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" "paddsh %[g_vec0], %[y], %[bg] \n\t" "pmullh %[g_vec1], %[u], %[ug] \n\t" "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "pmullh %[g_vec1], %[v], %[vg] \n\t" "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "paddsh %[r_vec0], %[y], %[br] \n\t" "pmullh %[r_vec1], %[v], %[vr] \n\t" "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" "psrah %[b_vec0], %[b_vec0], %[six] \n\t" "psrah %[g_vec0], %[g_vec0], %[six] \n\t" "psrah %[r_vec0], %[r_vec0], %[six] \n\t" "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 
0x08 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [alpha]"f"(-1), [six]"f"(0x6), [five]"f"(0x55), [mask]"f"(mask), [two]"f"(0x02), [mask1]"f"(0x00ff00ff00ff00ff) : "memory" ); } /* I422AlphaToARGBRow_MMI: same conversion as the 8-bit half-width-chroma ARGB converter, except that the alpha lanes are the 4 bytes loaded from src_a on each pass instead of a constant (a_ptr advances by 4). Reads 4 Y bytes, uses 2 U and 2 V samples (the 4-byte U/V loads read 2 bytes past the consumed samples) and stores 16 bytes per pass. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void I422AlphaToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y,u,v,a; uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" 
"gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 "pmulhuh %[y], %[y], %[yg] \n\t"//y1 //u3|u2|u1|u0 --> u1|u1|u0|u0 "punpcklbh %[u], %[u], %[u] \n\t"//u "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec0], %[y], %[bb] \n\t" "pmullh %[b_vec1], %[u], %[ub] \n\t" "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" "psrah %[b_vec0], %[b_vec0], %[six] \n\t" //v3|v2|v1|v0 --> v1|v1|v0|v0 "punpcklbh %[v], %[v], %[v] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec0], %[y], %[bg] \n\t" "pmullh %[g_vec1], %[u], %[ug] \n\t" "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "pmullh %[g_vec1], %[v], %[vg] \n\t" "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "psrah %[g_vec0], %[g_vec0], %[six] \n\t" "paddsh %[r_vec0], %[y], %[br] \n\t" "pmullh %[r_vec1], %[v], %[vr] \n\t" "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" "psrah %[r_vec0], %[r_vec0], %[six] \n\t" "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb "packushb %[g_vec0], %[g_vec0], %[a] \n\t" "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [a]"=&f"(a), [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), 
[br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [a_ptr]"r"(src_a), [zero]"f"(0x00), [six]"f"(0x6), [five]"f"(0x55), [mask]"f"(mask) : "memory" ); } /* I422ToRGB24Row_MMI: same per-channel arithmetic as the half-width-chroma ARGB converter, but the output carries no alpha byte: each pass stores 12 bytes (one 8-byte store plus one 4-byte store at offset 8) for 4 pixels and advances rgb_buf by 0x0c. Reads 4 Y bytes and uses 2 U and 2 V samples per pass (the 4-byte U/V loads read 2 bytes past the consumed samples). Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void I422ToRGB24Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y,u,v; uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 "pmulhuh %[y], %[y], %[yg] \n\t"//y1 //u3|u2|u1|u0 --> u1|u1|u0|u0 "punpcklbh %[u], %[u], %[u] \n\t"//u "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec0], %[y], %[bb] \n\t" "pmullh %[b_vec1], %[u], %[ub] \n\t" "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" "psrah %[b_vec0], %[b_vec0], %[six] \n\t" //v3|v2|v1|v0 --> v1|v1|v0|v0 "punpcklbh %[v], %[v], %[v] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec0], %[y], %[bg] \n\t" "pmullh %[g_vec1], %[u], %[ug] \n\t" "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "pmullh %[g_vec1], %[v], 
%[vg] \n\t" "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" "psrah %[g_vec0], %[g_vec0], %[six] \n\t" "paddsh %[r_vec0], %[y], %[br] \n\t" "pmullh %[r_vec1], %[v], %[vr] \n\t" "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" "psrah %[r_vec0], %[r_vec0], %[six] \n\t" "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), 
[six]"f"(0x6), [mask]"f"(mask), [lmove1]"f"(0x18), [rmove1]"f"(0x8), [one]"f"(0x1) : "memory" ); } /* I422ToARGB4444Row_MMI: runs the same half-width-chroma YUV arithmetic, then masks the 32-bit pixels with 0xf0f0f0f0f0f0f0f0 and shifts/merges by 4 so that each pixel is narrowed to 16 bits (presumably 4 bits per channel, as the name suggests - confirm). Each pass reads 4 Y bytes, uses 2 U and 2 V samples (the 4-byte U/V loads read 2 bytes past the consumed samples) and stores 8 bytes for 4 pixels. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void I422ToARGB4444Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 "pmulhuh %[y], %[y], %[yg] \n\t"//y1 //u3|u2|u1|u0 --> u1|u1|u0|u0 "punpcklbh %[u], %[u], %[u] \n\t"//u "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" //v3|v2|v1|v0 --> v1|v1|v0|v0 "punpcklbh %[v], %[v], %[v] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah 
%[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "and %[g_vec], %[g_vec], %[mask1] \n\t" "psrlw %[g_vec], %[g_vec], %[four] \n\t" "psrlw %[r_vec], %[g_vec], %[four] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" "and %[g_vec], %[g_vec], %[r_vec] \n\t" "and %[b_vec], %[b_vec], %[mask1] \n\t" "psrlw %[b_vec], %[b_vec], %[four] \n\t" "psrlw %[r_vec], %[b_vec], %[four] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" "and %[b_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), [alpha]"f"(-1) : "memory" ); } /* I422ToARGB1555Row_MMI: runs the same half-width-chroma YUV arithmetic, then shifts each channel right by 3, masks it with 0x1f, repositions it with 5-bit left shifts and ORs 0x8000 into every 32-bit lane (mask3 - presumably the 1-bit alpha, confirm) before narrowing to 16 bits per pixel. Each pass reads 4 Y bytes, uses 2 U and 2 V samples (the 4-byte U/V loads read 2 bytes past the consumed samples) and stores 8 bytes for 4 pixels. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void I422ToARGB1555Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t 
ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" //u3|u2|u1|u0 --> u1|u1|u0|u0 "punpcklbh %[u], %[u], %[u] \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" //v3|v2|v1|v0 --> v1|v1|v0|v0 "punpcklbh %[v], %[v], %[v] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "psrlw %[temp], %[g_vec], %[three] \n\t" 
"and %[g_vec], %[temp], %[mask2] \n\t" "psrlw %[temp], %[temp], %[eight] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "psrlw %[temp], %[temp], %[eight] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "or %[g_vec], %[g_vec], %[mask3] \n\t" "psrlw %[temp], %[b_vec], %[three] \n\t" "and %[b_vec], %[temp], %[mask2] \n\t" "psrlw %[temp], %[temp], %[eight] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "psrlw %[temp], %[temp], %[eight] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "or %[b_vec], %[b_vec], %[mask3] \n\t" "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [three]"f"(0x3), [mask2]"f"(0x1f0000001f), [eight]"f"(0x8), [mask3]"f"(0x800000008000), [lmove5]"f"(0x5) : "memory" ); } void 
/* I422ToRGB565Row_MMI: converts one row of planar YUV with half-width chroma into 16-bit pixels, 4 pixels per loop pass. Coefficients are loaded once from yuvconstants at byte offsets 0x00 (ub), 0x20 (ug and vg), 0x40 (vr), 0x60 (bb), 0x80 (bg), 0xa0 (br), 0xc0 (yg). Each pass reads 4 Y bytes, uses 2 U and 2 V samples (the 4-byte U/V loads read 2 bytes past the consumed samples), then shifts/masks the channels and narrows them to 16 bits per pixel (presumably 5:6:5 as the name suggests - confirm), storing 8 bytes. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ I422ToRGB565Row_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" //u3|u2|u1|u0 --> u1|u1|u0|u0 "punpcklbh %[u], %[u], %[u] \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" //v3|v2|v1|v0 --> v1|v1|v0|v0 "punpcklbh %[v], %[v], %[v] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" 
"punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "psrlh %[temp], %[g_vec], %[three] \n\t" "and %[g_vec], %[temp], %[mask2] \n\t" "psrlw %[temp], %[temp], %[seven] \n\t" "psrlw %[r_vec], %[mask1], %[eight] \n\t" "and %[r_vec], %[temp], %[r_vec] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "paddb %[r_vec], %[three], %[six] \n\t" "psrlw %[temp], %[temp], %[r_vec] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "paddb %[temp], %[three], %[eight] \n\t" "psllw %[r_vec], %[r_vec], %[temp] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "psrlh %[temp], %[b_vec], %[three] \n\t" "and %[b_vec], %[temp], %[mask2] \n\t" "psrlw %[temp], %[temp], %[seven] \n\t" "psrlw %[r_vec], %[mask1], %[eight] \n\t" "and %[r_vec], %[temp], %[r_vec] \n\t" "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "paddb %[r_vec], %[three], %[six] \n\t" "psrlw %[temp], %[temp], %[r_vec] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "paddb %[temp], %[three], %[eight] \n\t" "psllw %[r_vec], %[r_vec], %[temp] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), 
[dst_rgb565]"r"(dst_rgb565), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [three]"f"(0x3), [mask2]"f"(0x1f0000001f), [eight]"f"(0x8), [seven]"f"(0x7), [lmove5]"f"(0x5) : "memory" ); } /* NV12ToARGBRow_MMI: converts one row of a Y plane plus an interleaved chroma plane (src_uv) into 4-byte pixels, 4 pixels per loop pass. Each pass loads 4 Y bytes and 4 chroma bytes, widens the chroma bytes to halfwords and splits them with two pshufh shuffles (selector 0xA0 feeds u, selector 0xf5 feeds v - presumably lanes 0,0,2,2 and 1,1,3,3, i.e. even bytes are U and odd bytes are V, each shared by two pixels; confirm against the MMI manual). Stores 16 bytes per pass with alpha lanes from the constant -1. Both y_ptr and uv_ptr advance by 4. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void NV12ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "pshufh %[v], %[u], %[vshu] \n\t" "pshufh %[u], %[u], %[ushu] \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" 
"packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [alpha]"f"(-1) : "memory" ); } /* NV21ToARGBRow_MMI: identical to the NV12 converter except that the two pshufh selectors are swapped when splitting the interleaved chroma plane (src_vu): selector 0xA0 now feeds v and selector 0xf5 feeds u, so the byte order within each chroma pair is reversed. Each pass loads 4 Y bytes and 4 chroma bytes and stores 16 bytes with alpha lanes from the constant -1. Exits only when width reaches exactly 0 after subtracting 4, so width must be a positive multiple of 4. NOTE(review): the pointer and width operands are modified but declared input-only. */ void NV21ToARGBRow_MMI(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh 
%[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "pshufh %[v], %[u], %[ushu] \n\t" "pshufh %[u], %[u], %[vshu] \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), 
[five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [alpha]"f"(-1) : "memory" ); } void NV12ToRGB24Row_MMI(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "pshufh %[v], %[u], %[vshu] \n\t" "pshufh %[u], %[u], %[ushu] \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh 
%[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" "psllw %[temp], %[r_vec], %[lmove1] \n\t" "or %[g_vec], %[g_vec], %[temp] \n\t" "psrlw %[temp], %[r_vec], %[rmove1] \n\t" "pextrh %[temp], %[temp], %[zero] \n\t" "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" "pextrh %[temp], %[b_vec], %[zero] \n\t" "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" "pextrh %[temp], %[b_vec], %[one] \n\t" "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" "or %[b_vec], %[b_vec], %[temp] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [alpha]"f"(-1), [lmove1]"f"(0x18), [one]"f"(0x1), [rmove1]"f"(0x8) : "memory" ); } void NV21ToRGB24Row_MMI(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) 
\n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "pshufh %[v], %[u], %[ushu] \n\t" "pshufh %[u], %[u], %[vshu] \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" "psllw %[temp], %[r_vec], %[lmove1] \n\t" "or %[g_vec], %[g_vec], %[temp] \n\t" "psrlw %[temp], %[r_vec], %[rmove1] \n\t" "pextrh %[temp], %[temp], %[zero] \n\t" "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" "pextrh %[temp], %[b_vec], %[zero] \n\t" "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" "pextrh %[temp], %[b_vec], %[one] \n\t" "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" "psllw 
%[b_vec], %[b_vec], %[rmove1] \n\t" "or %[b_vec], %[b_vec], %[temp] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [lmove1]"f"(0x18), [rmove1]"f"(0x8), [one]"f"(0x1) : "memory" ); } void NV12ToRGB565Row_MMI(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" "gslwrc1 %[u], 
0x00(%[uv_ptr]) \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "pshufh %[v], %[u], %[vshu] \n\t" "pshufh %[u], %[u], %[ushu] \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "psrlh %[temp], %[g_vec], %[three] \n\t" "and %[g_vec], %[temp], %[mask2] \n\t" "psrlw %[temp], %[temp], %[seven] \n\t" "psrlw %[r_vec], %[mask1], %[eight] \n\t" "and %[r_vec], %[temp], %[r_vec] \n\t" "psubb %[y], %[eight], %[three] \n\t"//5 "psllw %[r_vec], %[r_vec], %[y] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "paddb %[r_vec], %[three], %[six] \n\t" "psrlw %[temp], %[temp], %[r_vec] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" "paddb %[temp], %[three], %[eight] \n\t" "psllw %[r_vec], %[r_vec], %[temp] \n\t" "or %[g_vec], %[g_vec], %[r_vec] \n\t" "psrlh %[temp], %[b_vec], %[three] \n\t" "and %[b_vec], %[temp], %[mask2] \n\t" "psrlw %[temp], %[temp], %[seven] \n\t" "psrlw %[r_vec], %[mask1], %[eight] \n\t" "and %[r_vec], %[temp], %[r_vec] \n\t" "psubb %[y], %[eight], %[three] \n\t"//5 "psllw %[r_vec], %[r_vec], %[y] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "paddb %[r_vec], %[three], %[six] \n\t" "psrlw %[temp], %[temp], %[r_vec] \n\t" "and %[r_vec], %[temp], %[mask2] \n\t" 
"paddb %[temp], %[three], %[eight] \n\t" "psllw %[r_vec], %[r_vec], %[temp] \n\t" "or %[b_vec], %[b_vec], %[r_vec] \n\t" "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), [dst_rgb565]"r"(dst_rgb565), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [three]"f"(0x3), [mask2]"f"(0x1f0000001f), [eight]"f"(0x8), [seven]"f"(0x7) : "memory" ); } void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" 
"gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" "psrlh %[temp], %[y], %[eight] \n\t" "pshufh %[u], %[temp], %[ushu] \n\t" "pshufh %[v], %[temp], %[vshu] \n\t" "psrlh %[temp], %[mask1], %[eight] \n\t" "and %[y], %[y], %[temp] \n\t" "psllh %[temp], %[y], %[eight] \n\t" "or %[y], %[y], %[temp] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [alpha]"f"(-1), 
[eight]"f"(0x8) : "memory" ); } void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" "psrlh %[temp], %[mask1], %[eight] \n\t" "and %[temp], %[y], %[temp] \n\t" "pshufh %[u], %[temp], %[ushu] \n\t" "pshufh %[v], %[temp], %[vshu] \n\t" "psrlh %[y], %[y], %[eight] \n\t" "psllh %[temp], %[y], %[eight] \n\t" "or %[y], %[y], %[temp] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" "punpckhbh %[r_vec], %[r_vec], %[g_vec] 
\n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), [ushu]"f"(0xA0), [vshu]"f"(0xf5), [alpha]"f"(-1), [eight]"f"(0x8) : "memory" ); } void I422ToRGBARow_MMI(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" "or %[ub], %[ub], %[mask1] \n\t" "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[ug], %[ug], %[zero] \n\t" "pshufh %[ug], %[ug], %[zero] \n\t" "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" "punpcklbh %[vg], %[vg], %[zero] \n\t" "pshufh %[vg], %[vg], %[five] \n\t" "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" "punpcklbh %[vr], %[vr], %[zero] \n\t" "pshufh %[vr], %[vr], %[five] \n\t" "or %[vr], %[vr], %[mask1] \n\t" "1: \n\t" "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" "gslwlc1 %[v], 
0x03(%[v_ptr]) \n\t" "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" "punpcklbh %[y], %[y], %[y] \n\t" "pmulhuh %[y], %[y], %[yg] \n\t" "punpcklbh %[u], %[u], %[u] \n\t" "punpcklbh %[u], %[u], %[zero] \n\t" "paddsh %[b_vec], %[y], %[bb] \n\t" "pmullh %[temp], %[u], %[ub] \n\t" "psubsh %[b_vec], %[b_vec], %[temp] \n\t" "psrah %[b_vec], %[b_vec], %[six] \n\t" "punpcklbh %[v], %[v], %[v] \n\t" "punpcklbh %[v], %[v], %[zero] \n\t" "paddsh %[g_vec], %[y], %[bg] \n\t" "pmullh %[temp], %[u], %[ug] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "pmullh %[temp], %[v], %[vg] \n\t" "psubsh %[g_vec], %[g_vec], %[temp] \n\t" "psrah %[g_vec], %[g_vec], %[six] \n\t" "paddsh %[r_vec], %[y], %[br] \n\t" "pmullh %[temp], %[v], %[vr] \n\t" "psubsh %[r_vec], %[r_vec], %[temp] \n\t" "psrah %[r_vec], %[r_vec], %[six] \n\t" "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" "packushb %[g_vec], %[g_vec], %[zero] \n\t" "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [y]"=&f"(y), [u]"=&f"(u), [v]"=&f"(v), [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), [ub]"=&f"(ub), [ug]"=&f"(ug), [vg]"=&f"(vg), [vr]"=&f"(vr), [bb]"=&f"(bb), [bg]"=&f"(bg), [br]"=&f"(br), [yg]"=&f"(yg) : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), [zero]"f"(0x00), [five]"f"(0x55), [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), 
[alpha]"f"(-1) : "memory" ); } void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { __asm__ volatile ( "punpcklwd %[v32], %[v32], %[v32] \n\t" "1: \n\t" "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" "daddi %[width], %[width], -0x04 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "bnez %[width], 1b \n\t" : [v32]"+&f"(v32) : [dst_ptr]"r"(dst_argb), [width]"r"(width) : "memory" ); } // clang-format on // 10 bit YUV to ARGB #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/row_msa.cc000066400000000000000000004327761416500237200210270ustar00rootroot00000000000000/* * Copyright 2016 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include #include "libyuv/row.h" // This module is for GCC MSA #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #include "libyuv/macros_msa.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #define ALPHA_VAL (-1) // Fill YUV -> RGB conversion constants into vectors #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ { \ ub = __msa_fill_w(yuvconst->kUVToB[0]); \ vr = __msa_fill_w(yuvconst->kUVToR[1]); \ ug = __msa_fill_w(yuvconst->kUVToG[0]); \ vg = __msa_fill_w(yuvconst->kUVToG[1]); \ bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ } // Load YUV 422 pixel data #define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ { \ uint64_t y_m; \ uint32_t u_m, v_m; \ v4i32 zero_m = {0}; \ y_m = LD(psrc_y); \ u_m = LW(psrc_u); \ v_m = LW(psrc_v); \ out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ } // Clip input vector elements between 0 to 255 #define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ { \ v4i32 max_m = __msa_ldi_w(0xFF); \ \ in0 = __msa_maxi_s_w(in0, 0); \ in1 = __msa_maxi_s_w(in1, 0); \ in2 = __msa_maxi_s_w(in2, 0); \ in3 = __msa_maxi_s_w(in3, 0); \ in4 = __msa_maxi_s_w(in4, 0); \ in5 = __msa_maxi_s_w(in5, 0); \ in0 = __msa_min_s_w(max_m, in0); \ in1 = __msa_min_s_w(max_m, in1); \ in2 = __msa_min_s_w(max_m, in2); \ in3 = __msa_min_s_w(max_m, in3); \ in4 = __msa_min_s_w(max_m, in4); \ in5 = __msa_min_s_w(max_m, in5); \ } // Convert 8 pixels of YUV 420 to RGB. 
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ { \ v8i16 vec0_m, vec1_m; \ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ v4i32 reg5_m, reg6_m, reg7_m; \ v16i8 zero_m = {0}; \ \ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ reg0_m *= yg; \ reg1_m *= yg; \ reg2_m *= ubvr; \ reg3_m *= ubvr; \ reg0_m = __msa_srai_w(reg0_m, 16); \ reg1_m = __msa_srai_w(reg1_m, 16); \ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ reg5_m = reg0_m - reg5_m; \ reg6_m = reg1_m - reg6_m; \ reg2_m = reg0_m - reg2_m; \ reg3_m = reg1_m - reg3_m; \ reg7_m = reg0_m - reg7_m; \ reg4_m = reg1_m - reg4_m; \ reg5_m += bb; \ reg6_m += bb; \ reg7_m += bg; \ reg4_m += bg; \ reg2_m += br; \ reg3_m += br; \ reg5_m = __msa_srai_w(reg5_m, 6); \ reg6_m = __msa_srai_w(reg6_m, 6); \ reg7_m = __msa_srai_w(reg7_m, 6); \ reg4_m = __msa_srai_w(reg4_m, 6); \ reg2_m = __msa_srai_w(reg2_m, 6); \ reg3_m = __msa_srai_w(reg3_m, 6); \ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ } // Pack and Store 8 ARGB values. 
// STOREARGB: interleave four channel vectors and store 32 bytes.
// The even-indexed bytes of in0..in3 (i.e. the low byte of each 16-bit lane)
// are interleaved as in0,in1,in2,in3 per pixel, and the two resulting
// vectors are written to pdst_argb with ST_UB2 using a 16-byte stride
// (ST_UB2 is not defined in this file - presumably from macros_msa.h).
#define STOREARGB(in0, in1, in2, in3, pdst_argb)           \
  {                                                        \
    v8i16 vec0_m, vec1_m;                                  \
    v16u8 dst0_m, dst1_m;                                  \
    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);          \
    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);          \
    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                 \
  }

// Takes ARGB input and calculates Y.
// argb0..argb3 are four vectors of pixel bytes. The even and odd 16-bit
// lanes are separated, multiplied with the byte coefficient vectors const0
// (even halfwords) and const1 (odd halfwords) by unsigned dot products and
// accumulated. const2 is then added, each 16-bit sum is shifted right by
// `shift`, and the 16 results are packed to bytes in y_out.
#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
                y_out)                                                     \
  {                                                                        \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
    v8u16 reg0_m, reg1_m;                                                  \
                                                                           \
    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
    reg0_m += const2;                                                      \
    reg1_m += const2;                                                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
  }

// Loads current and next row of ARGB input and averages it to calculate U
// and V.
//   s_ptr, t_ptr  pointers to the two rows; 64 bytes are read from each.
//   argb0..argb3  outputs: for every channel of every 2x2 pixel block, the
//                 16-bit value (sum of the four samples + const_0x0101) >> 1.
//   const_0x0101  vector added to every 16-bit sum before the shift.
// The row pointers are taken from the macro arguments. Earlier revisions
// ignored s_ptr / t_ptr and read variables literally named `s` and `t` from
// the expansion site, which only worked when the caller used those names.
#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3, const_0x0101) \
  {                                                                       \
    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
    v8u16 reg8_m, reg9_m;                                                 \
                                                                          \
    src0_m = (v16u8)__msa_ld_b((void*)(s_ptr), 0);                        \
    src1_m = (v16u8)__msa_ld_b((void*)(s_ptr), 16);                       \
    src2_m = (v16u8)__msa_ld_b((void*)(s_ptr), 32);                       \
    src3_m = (v16u8)__msa_ld_b((void*)(s_ptr), 48);                       \
    src4_m = (v16u8)__msa_ld_b((void*)(t_ptr), 0);                        \
    src5_m = (v16u8)__msa_ld_b((void*)(t_ptr), 16);                       \
    src6_m = (v16u8)__msa_ld_b((void*)(t_ptr), 32);                       \
    src7_m = (v16u8)__msa_ld_b((void*)(t_ptr), 48);                       \
    /* Pair each byte of the first row with the byte below it. */         \
    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    /* Vertical sums: one 16-bit lane per channel byte. */                \
    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    /* Horizontal sums: add even and odd 64-bit pixels. */                \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m += const_0x0101;                                               \
    reg9_m += const_0x0101;                                               \
    reg0_m += const_0x0101;                                               \
    reg1_m += const_0x0101;                                               \
    argb0 = (v8u16)__msa_srai_h((v8i16)reg8_m, 1);                        \
    argb1 = (v8u16)__msa_srai_h((v8i16)reg9_m, 1);                        \
    argb2 = (v8u16)__msa_srai_h((v8i16)reg0_m, 1);                        \
    argb3 = (v8u16)__msa_srai_h((v8i16)reg1_m, 1);                        \
  }

// ARGBTOUV: computes two vectors of 16-bit results from 16-bit channel data.
// shf0..shf3 are halfword shuffle controls selecting the lanes of
// argb0..argb3 that feed each dot product. For u_out the value is
//   (dot(shf0 lanes, const0) + const1 - dot(shf1 lanes, const2)) >> shift
// and for v_out
//   (dot(shf2 lanes, const0) + const1 - dot(shf3 lanes, const3)) >> shift
// computed in unsigned 32-bit lanes (logical shift by the vector `shift`)
// and then narrowed to 16 bits by taking the even halfwords.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
                 shf0, shf1, shf2, shf3, shift, u_out, v_out)                \
  {                                                                          \
    v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v4u32 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
                                                                             \
    vec0_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0);          \
    vec1_m = (v8u16)__msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2);          \
    vec2_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0);          \
    vec3_m = (v8u16)__msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2);          \
    vec4_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0);          \
    vec5_m = (v8u16)__msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2);          \
    vec6_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0);          \
    vec7_m = (v8u16)__msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2);          \
    reg0_m = __msa_dotp_u_w(vec0_m, const0);                                 \
    reg1_m = __msa_dotp_u_w(vec1_m, const0);                                 \
    reg2_m = __msa_dotp_u_w(vec4_m, const0);                                 \
    reg3_m = __msa_dotp_u_w(vec5_m, const0);                                 \
    reg0_m += const1;                                                        \
    reg1_m += const1;                                                        \
    reg2_m += const1;                                                        \
    reg3_m += const1;                                                        \
    reg0_m -= (v4u32)__msa_dotp_u_w(vec2_m, const2);                         \
    reg1_m -= (v4u32)__msa_dotp_u_w(vec3_m, const2);                         \
    reg2_m -= (v4u32)__msa_dotp_u_w(vec6_m, const3);                         \
    reg3_m -= (v4u32)__msa_dotp_u_w(vec7_m, const3);                         \
    reg0_m = __msa_srl_w(reg0_m, shift);                                     \
    reg1_m = __msa_srl_w(reg1_m, shift);                                     \
    reg2_m = __msa_srl_w(reg2_m, shift);                                     \
    reg3_m = __msa_srl_w(reg3_m, shift);                                     \
    u_out = (v8u16)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m);              \
    v_out = (v8u16)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);              \
  }

// Takes ARGB input and calculates U and V.
#define ARGBTOUV_H(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ shf0, shf1, shf2, shf3, v_out, u_out) \ { \ v8u16 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ v4u32 reg0_m, reg1_m, reg2_m, reg3_m; \ \ vec0_m = __msa_vshf_h(shf0, (v16i8)argb1, (v16i8)argb0); \ vec1_m = __msa_vshf_h(shf0, (v16i8)argb3, (v16i8)argb2); \ vec2_m = __msa_vshf_h(shf1, (v16i8)argb1, (v16i8)argb0); \ vec3_m = __msa_vshf_h(shf1, (v16i8)argb3, (v16i8)argb2); \ vec4_m = __msa_vshf_h(shf2, (v16i8)argb1, (v16i8)argb0); \ vec5_m = __msa_vshf_h(shf2, (v16i8)argb3, (v16i8)argb2); \ vec6_m = __msa_vshf_h(shf3, (v16i8)argb1, (v16i8)argb0); \ vec7_m = __msa_vshf_h(shf3, (v16i8)argb3, (v16i8)argb2); \ reg0_m = __msa_dotp_u_w(vec0_m, const1); \ reg1_m = __msa_dotp_u_w(vec1_m, const1); \ reg2_m = __msa_dotp_u_w(vec4_m, const1); \ reg3_m = __msa_dotp_u_w(vec5_m, const1); \ reg0_m += (v4u32)const3; \ reg1_m += (v4u32)const3; \ reg2_m += (v4u32)const3; \ reg3_m += (v4u32)const3; \ reg0_m -= __msa_dotp_u_w(vec2_m, const0); \ reg1_m -= __msa_dotp_u_w(vec3_m, const0); \ reg2_m -= __msa_dotp_u_w(vec6_m, const2); \ reg3_m -= __msa_dotp_u_w(vec7_m, const2); \ u_out = (v16u8)__msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ v_out = (v16u8)__msa_pckev_h((v8i16)reg1_m, (v8i16)reg0_m); \ u_out = (v16u8)__msa_pckod_b((v16i8)u_out, (v16i8)u_out); \ v_out = (v16u8)__msa_pckod_b((v16i8)v_out, (v16i8)v_out); \ } // Load I444 pixel data #define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ { \ uint64_t y_m, u_m, v_m; \ v2i64 zero_m = {0}; \ y_m = LD(psrc_y); \ u_m = LD(psrc_u); \ v_m = LD(psrc_v); \ out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ } void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; src 
+= width - 64; for (x = 0; x < width; x += 64) { LD_UB4(src, 16, src3, src2, src1, src0); VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); ST_UB4(dst0, dst1, dst2, dst3, dst, 16); dst += 64; src -= 64; } } void MirrorUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_uv, int width) { int x; v8u16 src, dst; v8u16 shuffler = {7, 6, 5, 4, 3, 2, 1, 0}; src_uv += (width - 8) << 1; for (x = 0; x < width; x += 8) { src = LD_UH(src_uv); dst = __msa_vshf_h(shuffler, src, src); ST_UH(dst, dst_uv); src_uv -= 16; dst_uv += 16; } } void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; src += width * 4 - 64; for (x = 0; x < width; x += 16) { LD_UB4(src, 16, src3, src2, src1, src0); VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); ST_UB4(dst0, dst1, dst2, dst3, dst, 16); dst += 64; src -= 64; } } void I422ToYUY2Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { int x; v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; for (x = 0; x < width; x += 32) { src_u0 = LD_UB(src_u); src_v0 = LD_UB(src_v); LD_UB2(src_y, 16, src_y0, src_y1); ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); src_u += 16; src_v += 16; src_y += 32; dst_yuy2 += 64; } } void I422ToUYVYRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { int x; v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; for (x = 
0; x < width; x += 32) { src_u0 = LD_UB(src_u); src_v0 = LD_UB(src_v); LD_UB2(src_y, 16, src_y0, src_y1); ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); src_u += 16; src_v += 16; src_y += 32; dst_uyvy += 64; } } void I422ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_y += 8; src_u += 4; src_v += 4; dst_argb += 32; } } void I422ToRGBARow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, 
vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); STOREARGB(alpha, vec0, vec1, vec2, dst_argb); src_y += 8; src_u += 4; src_v += 4; dst_argb += 32; } } void I422AlphaToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; int64_t data_a; v16u8 src0, src1, src2, src3; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v4i32 zero = {0}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { data_a = LD(src_a); READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); STOREARGB(vec0, vec1, vec2, src3, dst_argb); src_y += 8; src_u += 4; src_v += 4; src_a += 8; dst_argb += 32; } } void I422ToRGB24Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int32_t width) { int x; int64_t data_u, data_v; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; v8i16 vec0, vec1, vec2, vec3, vec4, vec5; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 reg0, reg1, reg2, reg3; v2i64 zero = {0}; v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = 
__msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); data_u = LD(src_u); data_v = LD(src_v); src1 = (v16u8)__msa_insert_d(zero, 0, data_u); src2 = (v16u8)__msa_insert_d(zero, 0, data_v); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec3, vec4, vec5); reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); ST_UB2(dst0, dst1, dst_argb, 16); ST_UB(dst2, (dst_argb + 32)); src_y += 16; src_u += 8; src_v += 8; dst_argb += 48; } } // TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. 
void I422ToRGB565Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec2, vec1); vec0 = __msa_srai_h(vec0, 3); vec1 = __msa_srai_h(vec1, 3); vec2 = __msa_srai_h(vec2, 2); vec1 = __msa_slli_h(vec1, 11); vec2 = __msa_slli_h(vec2, 5); vec0 |= vec1; dst0 = (v16u8)(vec2 | vec0); ST_UB(dst0, dst_rgb565); src_y += 8; src_u += 4; src_v += 4; dst_rgb565 += 16; } } // TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
void I422ToARGB4444Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v8u16 reg0, reg1, reg2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); reg0 = (v8u16)__msa_srai_h(vec0, 4); reg1 = (v8u16)__msa_srai_h(vec1, 4); reg2 = (v8u16)__msa_srai_h(vec2, 4); reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); reg1 |= const_0xF000; reg0 |= reg2; dst0 = (v16u8)(reg1 | reg0); ST_UB(dst0, dst_argb4444); src_y += 8; src_u += 4; src_v += 4; dst_argb4444 += 16; } } void I422ToARGB1555Row_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2, dst0; v8i16 vec0, vec1, vec2; v8u16 reg0, reg1, reg2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { READYUV422(src_y, src_u, src_v, src0, src1, src2); src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, 
vec1, vec2); reg0 = (v8u16)__msa_srai_h(vec0, 3); reg1 = (v8u16)__msa_srai_h(vec1, 3); reg2 = (v8u16)__msa_srai_h(vec2, 3); reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); reg1 |= const_0x8000; reg0 |= reg2; dst0 = (v16u8)(reg1 | reg0); ST_UB(dst0, dst_argb1555); src_y += 8; src_u += 4; src_v += 4; dst_argb1555 += 16; } } void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 32) { LD_UB4(src_yuy2, 16, src0, src1, src2, src3); dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, dst_y, 16); src_yuy2 += 64; dst_y += 32; } } void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, int src_stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; int x; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, dst0, dst1; for (x = 0; x < width; x += 32) { LD_UB4(src_yuy2, 16, src0, src1, src2, src3); LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); vec0 = __msa_aver_u_b(src0, src2); vec1 = __msa_aver_u_b(src1, src3); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); src_yuy2 += 64; src_yuy2_next += 64; dst_u += 16; dst_v += 16; } } void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 32) { LD_UB4(src_yuy2, 16, src0, src1, src2, src3); src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); dst0 = 
(v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); src_yuy2 += 64; dst_u += 16; dst_v += 16; } } void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 32) { LD_UB4(src_uyvy, 16, src0, src1, src2, src3); dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, dst_y, 16); src_uyvy += 64; dst_y += 32; } } void UYVYToUVRow_MSA(const uint8_t* src_uyvy, int src_stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; int x; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, dst0, dst1; for (x = 0; x < width; x += 32) { LD_UB4(src_uyvy, 16, src0, src1, src2, src3); LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); vec0 = __msa_aver_u_b(src0, src2); vec1 = __msa_aver_u_b(src1, src3); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); src_uyvy += 64; src_uyvy_next += 64; dst_u += 16; dst_v += 16; } } void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 32) { LD_UB4(src_uyvy, 16, src0, src1, src2, src3); src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); src_uyvy += 64; dst_u += 16; dst_v += 16; } } void 
ARGBToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; v16i8 zero = {0}; v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); reg0 *= const_0x19; reg1 *= const_0x19; reg2 *= const_0x81; reg3 *= const_0x81; reg4 *= const_0x42; reg5 *= const_0x42; reg0 += reg2; reg1 += reg3; reg0 += reg4; reg1 += reg5; reg0 += const_0x1080; reg1 += const_0x1080; reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_y); src_argb += 64; dst_y += 16; } } void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* src_argb_next = src_argb + src_stride_argb; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; v16u8 dst0, dst1; v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x38); v8u16 const_0x4A = 
(v8u16)__msa_ldi_h(0x25); v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x13); v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x2f); v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); src2 = (v16u8)__msa_ld_b((v16u8*)src_argb, 32); src3 = (v16u8)__msa_ld_b((v16u8*)src_argb, 48); src4 = (v16u8)__msa_ld_b((v16u8*)src_argb, 64); src5 = (v16u8)__msa_ld_b((v16u8*)src_argb, 80); src6 = (v16u8)__msa_ld_b((v16u8*)src_argb, 96); src7 = (v16u8)__msa_ld_b((v16u8*)src_argb, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); reg0 = __msa_hadd_u_h(vec8, vec8); reg1 = __msa_hadd_u_h(vec9, vec9); reg2 = __msa_hadd_u_h(vec4, vec4); reg3 = __msa_hadd_u_h(vec5, vec5); reg4 = __msa_hadd_u_h(vec0, vec0); reg5 = __msa_hadd_u_h(vec1, vec1); src0 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 16); src2 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 32); src3 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 48); src4 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 64); src5 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 80); src6 = 
(v16u8)__msa_ld_b((v16u8*)src_argb_next, 96); src7 = (v16u8)__msa_ld_b((v16u8*)src_argb_next, 112); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); reg0 += __msa_hadd_u_h(vec8, vec8); reg1 += __msa_hadd_u_h(vec9, vec9); reg2 += __msa_hadd_u_h(vec4, vec4); reg3 += __msa_hadd_u_h(vec5, vec5); reg4 += __msa_hadd_u_h(vec0, vec0); reg5 += __msa_hadd_u_h(vec1, vec1); reg0 += const_0x0001; reg1 += const_0x0001; reg2 += const_0x0001; reg3 += const_0x0001; reg4 += const_0x0001; reg5 += const_0x0001; reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 1); reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 1); reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 1); reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 1); reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 1); reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 1); reg6 = reg0 * const_0x70; reg7 = reg1 * const_0x70; reg8 = reg2 * const_0x4A; reg9 = reg3 * const_0x4A; reg6 += const_0x8080; reg7 += const_0x8080; reg8 += reg4 * const_0x26; reg9 += reg5 * const_0x26; reg0 *= const_0x12; reg1 *= const_0x12; reg2 *= const_0x5E; reg3 *= const_0x5E; reg4 *= const_0x70; reg5 *= const_0x70; reg2 += reg0; reg3 += reg1; reg4 += const_0x8080; reg5 += const_0x8080; reg6 -= reg8; reg7 -= reg9; reg4 -= reg2; reg5 -= reg3; reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); reg7 = 
(v8u16)__msa_srai_h((v8i16)reg7, 8); reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); src_argb += 128; src_argb_next += 128; dst_u += 16; dst_v += 16; } } void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2; v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25}; v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 26, 28, 29, 30}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, dst_rgb, 16); ST_UB(dst2, (dst_rgb + 32)); src_argb += 64; dst_rgb += 48; } } void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2; v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25}; v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25, 24, 30, 29, 28}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); dst2 = (v16u8)__msa_vshf_b(shuffler2, 
(v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, dst_rgb, 16); ST_UB(dst2, (dst_rgb + 32)); src_argb += 64; dst_rgb += 48; } } void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, dst0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v16i8 zero = {0}; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); vec0 = __msa_binsli_b(vec0, vec1, 2); vec1 = __msa_binsli_b(vec2, vec3, 4); vec4 = __msa_binsli_b(vec4, vec5, 2); vec5 = __msa_binsli_b(vec6, vec7, 4); vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); ST_UB(dst0, dst_rgb); src_argb += 32; dst_rgb += 16; } } void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1, dst0; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v16i8 zero = {0}; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); vec5 = 
(v16u8)__msa_srai_b((v16i8)src1, 3); vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); vec0 = __msa_binsli_b(vec0, vec1, 2); vec5 = __msa_binsli_b(vec5, vec6, 2); vec1 = __msa_binsli_b(vec2, vec3, 5); vec6 = __msa_binsli_b(vec7, vec8, 5); vec1 = __msa_binsli_b(vec1, vec4, 0); vec6 = __msa_binsli_b(vec6, vec9, 0); vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); ST_UB(dst0, dst_rgb); src_argb += 32; dst_rgb += 16; } } void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; v16u8 src0, src1; v16u8 vec0, vec1; v16u8 dst0; v16i8 zero = {0}; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); vec0 = __msa_binsli_b(vec0, src0, 3); vec1 = __msa_binsli_b(vec1, src1, 3); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_rgb); src_argb += 32; dst_rgb += 16; } } void ARGBToUV444Row_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int32_t width) { int32_t x; v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 vec8, vec9, vec10, vec11; v8u16 const_112 = (v8u16)__msa_ldi_h(112); v8u16 const_74 = (v8u16)__msa_ldi_h(74); v8u16 const_38 = (v8u16)__msa_ldi_h(38); v8u16 const_94 = 
(v8u16)__msa_ldi_h(94); v8u16 const_18 = (v8u16)__msa_ldi_h(18); v8u16 const_32896 = (v8u16)__msa_fill_h(32896); v16i8 zero = {0}; for (x = width; x > 0; x -= 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); vec10 = vec0 * const_18; vec11 = vec1 * const_18; vec8 = vec2 * const_94; vec9 = vec3 * const_94; vec6 = vec4 * const_112; vec7 = vec5 * const_112; vec0 *= const_112; vec1 *= const_112; vec2 *= const_74; vec3 *= const_74; vec4 *= const_38; vec5 *= const_38; vec8 += vec10; vec9 += vec11; vec6 += const_32896; vec7 += const_32896; vec0 += const_32896; vec1 += const_32896; vec2 += vec4; vec3 += vec5; vec0 -= vec2; vec1 -= vec3; vec6 -= vec8; vec7 -= vec9; vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); src_argb += 64; dst_u += 16; dst_v += 16; } } void ARGBMultiplyRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, dst0; v8u16 vec0, 
vec1, vec2, vec3; v4u32 reg0, reg1, reg2, reg3; v8i16 zero = {0}; for (x = 0; x < width; x += 4) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb1, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_argb); src_argb += 16; src_argb1 += 16; dst_argb += 16; } } void ARGBAddRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; src_argb1 += 32; dst_argb += 32; } } void ARGBSubtractRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = 
(v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); dst0 = __msa_subs_u_b(src0, src2); dst1 = __msa_subs_u_b(src1, src3); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; src_argb1 += 32; dst_argb += 32; } } void ARGBAttenuateRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v8i16 zero = {0}; v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); vec4 = (v8u16)__msa_fill_h(vec0[3]); vec5 = (v8u16)__msa_fill_h(vec0[7]); vec6 = (v8u16)__msa_fill_h(vec1[3]); vec7 = (v8u16)__msa_fill_h(vec1[7]); vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); vec6 = (v8u16)__msa_fill_h(vec2[3]); vec7 = (v8u16)__msa_fill_h(vec2[7]); vec8 = (v8u16)__msa_fill_h(vec3[3]); vec9 = (v8u16)__msa_fill_h(vec3[7]); vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4); reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); 
reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); dst0 = __msa_bmnz_v(dst0, src0, mask); dst1 = __msa_bmnz_v(dst1, src1, mask); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; dst_argb += 32; } } void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, uint32_t dither4, int width) { int x; v16u8 src0, src1, dst0, vec0, vec1; v8i16 vec_d0; v8i16 reg0, reg1, reg2; v16i8 zero = {0}; v8i16 max = __msa_ldi_h(0xFF); vec_d0 = (v8i16)__msa_fill_w(dither4); vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); reg0 += vec_d0; reg1 += vec_d0; reg2 += vec_d0; reg0 = __msa_maxi_s_h((v8i16)reg0, 0); reg1 = __msa_maxi_s_h((v8i16)reg1, 0); reg2 = __msa_maxi_s_h((v8i16)reg2, 0); reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); reg1 
= __msa_min_s_h((v8i16)max, (v8i16)reg1); reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); reg0 = __msa_srai_h(reg0, 3); reg2 = __msa_srai_h(reg2, 3); reg1 = __msa_srai_h(reg1, 2); reg2 = __msa_slli_h(reg2, 11); reg1 = __msa_slli_h(reg1, 5); reg0 |= reg1; dst0 = (v16u8)(reg0 | reg2); ST_UB(dst0, dst_rgb); src_argb += 32; dst_rgb += 16; } } void ARGBShuffleRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { int x; v16u8 src0, src1, dst0, dst1; v16i8 vec0; v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; int32_t val = LW((int32_t*)shuffler); vec0 = (v16i8)__msa_fill_w(val); shuffler_vec += vec0; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; dst_argb += 32; } } void ARGBShadeRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { int x; v16u8 src0, dst0; v8u16 vec0, vec1; v4u32 reg0, reg1, reg2, reg3, rgba_scale; v8i16 zero = {0}; rgba_scale[0] = value; rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); for (x = 0; x < width; x += 4) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); reg0 *= rgba_scale; reg1 *= rgba_scale; reg2 *= rgba_scale; reg3 *= rgba_scale; reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); vec0 = 
(v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_argb); src_argb += 16; dst_argb += 16; } } void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, vec1, dst0, dst1; v8u16 reg0; v16u8 const_0x4D = (v16u8)__msa_ldi_h(0x4D); v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16); vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); reg0 = __msa_dotp_u_h(vec0, const_0x961D); reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x4D); reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 8); vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; dst_argb += 32; } } void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2; v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); reg2 = 
(v8u16)__msa_dotp_u_h(vec0, const_0x6218); reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); ST_UB2(dst0, dst1, dst_argb, 16); dst_argb += 32; } } void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1; v8u16 vec0, vec1, vec2, vec3; v16u8 dst0, dst1, dst2, dst3; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0); src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16); vec0 = (v8u16)__msa_andi_b(src0, 0x0F); vec1 = (v8u16)__msa_andi_b(src1, 0x0F); vec2 = (v8u16)__msa_andi_b(src0, 0xF0); vec3 = (v8u16)__msa_andi_b(src1, 0xF0); vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_argb4444 += 32; dst_argb += 64; } } void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { int x; v8u16 src0, src1; v8u16 vec0, vec1, vec2, vec3, vec4, vec5; v16u8 
reg0, reg1, reg2, reg3, reg4, reg5, reg6; v16u8 dst0, dst1, dst2, dst3; v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_h((void*)src_argb1555, 0); src1 = (v8u16)__msa_ld_h((void*)src_argb1555, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); vec2 = src0 & const_0x1F; vec3 = src1 & const_0x1F; src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); vec4 = src0 & const_0x1F; vec5 = src1 & const_0x1F; src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); reg3 = -reg3; reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_argb1555 += 32; dst_argb += 64; } } void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8u16 const_0x1F = 
(v8u16)__msa_ldi_h(0x1F); v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_h((void*)src_rgb565, 0); src1 = (v8u16)__msa_ld_h((void*)src_rgb565, 16); vec0 = src0 & const_0x1F; vec1 = src0 & const_0x7E0; vec2 = src0 & const_0xF800; vec3 = src1 & const_0x1F; vec4 = src1 & const_0x7E0; vec5 = src1 & const_0xF800; reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_rgb565 += 32; dst_argb += 64; } } void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2; v16u8 vec0, vec1, vec2; v16u8 dst0, dst1, dst2, dst3; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_rgb24, 0); src1 = (v16u8)__msa_ld_b((void*)src_rgb24, 16); src2 = (v16u8)__msa_ld_b((void*)src_rgb24, 32); vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); vec1 
= (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_rgb24 += 48; dst_argb += 64; } } void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2; v16u8 vec0, vec1, vec2; v16u8 dst0, dst1, dst2, dst3; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_raw += 48; dst_argb += 64; } } void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; v16u8 dst0; v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_b((void*)src_argb1555, 0); src1 = (v8u16)__msa_ld_b((void*)src_argb1555, 16); vec0 = src0 & const_0x1F; vec1 = src1 & 
const_0x1F; src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); vec2 = src0 & const_0x1F; vec3 = src1 & const_0x1F; src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); vec4 = src0 & const_0x1F; vec5 = src1 & const_0x1F; reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); reg0 *= const_0x19; reg1 *= const_0x19; reg2 *= const_0x81; reg3 *= const_0x81; reg4 *= const_0x42; reg5 *= const_0x42; reg0 += reg2; reg1 += reg3; reg0 += reg4; reg1 += reg5; reg0 += const_0x1080; reg1 += const_0x1080; reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_y); src_argb1555 += 32; dst_y += 16; } } void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 reg0, reg1, reg2, reg3, reg4, reg5; v4u32 res0, res1, res2, res3; v16u8 dst0; v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); v8i16 const_0x1080 = __msa_fill_h(0x1080); v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_b((void*)src_rgb565, 0); src1 = (v8u16)__msa_ld_b((void*)src_rgb565, 16); vec0 = src0 & const_0x1F; vec1 = src0 & const_0x7E0; vec2 = src0 & const_0xF800; vec3 = src1 
& const_0x1F; vec4 = src1 & const_0x7E0; vec5 = src1 & const_0xF800; reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); src_rgb565 += 32; dst_y += 16; } } void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, 
reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 24}; v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; v16i8 zero = {0}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); vec0 += const_0x1080; vec1 += const_0x1080; vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); src_argb += 48; dst_y += 16; } } void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; v8u16 vec0, vec1, vec2, vec3; v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; v16i8 
mask1 = {12, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 21, 22, 23, 24}; v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; v16i8 zero = {0}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); vec0 += const_0x1080; vec1 += const_0x1080; vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_y); src_argb += 48; dst_y += 16; } } void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint16_t* s = (const uint16_t*)src_argb1555; const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); int64_t res0, res1; v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; v16u8 dst0; v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); v8u16 const_0x8080 = 
(v8u16)__msa_fill_h(0x8080); v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_b((void*)s, 0); src1 = (v8u16)__msa_ld_b((void*)s, 16); src2 = (v8u16)__msa_ld_b((void*)t, 0); src3 = (v8u16)__msa_ld_b((void*)t, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; vec0 += src2 & const_0x1F; vec1 += src3 & const_0x1F; vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); vec2 = src0 & const_0x1F; vec3 = src1 & const_0x1F; vec2 += src2 & const_0x1F; vec3 += src3 & const_0x1F; vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); vec4 = src0 & const_0x1F; vec5 = src1 & const_0x1F; vec4 += src2 & const_0x1F; vec5 += src3 & const_0x1F; vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); reg0 = vec6 * const_0x70; reg1 = vec0 * const_0x4A; reg2 = vec2 * const_0x70; reg3 = vec0 * const_0x5E; reg0 += const_0x8080; reg1 += vec2 * const_0x26; reg2 += const_0x8080; reg3 += vec6 * const_0x12; reg0 -= reg1; reg2 -= reg3; reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); res0 = __msa_copy_u_d((v2i64)dst0, 0); res1 = __msa_copy_u_d((v2i64)dst0, 1); SD(res0, dst_u); SD(res1, dst_v); 
s += 16; t += 16; dst_u += 8; dst_v += 8; } } void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint16_t* s = (const uint16_t*)src_rgb565; const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); int64_t res0, res1; v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5; v16u8 dst0; v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); for (x = 0; x < width; x += 16) { src0 = (v8u16)__msa_ld_b((void*)s, 0); src1 = (v8u16)__msa_ld_b((void*)s, 16); src2 = (v8u16)__msa_ld_b((void*)t, 0); src3 = (v8u16)__msa_ld_b((void*)t, 16); vec0 = src0 & const_0x1F; vec1 = src1 & const_0x1F; vec0 += src2 & const_0x1F; vec1 += src3 & const_0x1F; vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); vec2 = src0 & const_0x3F; vec3 = src1 & const_0x3F; vec2 += src2 & const_0x3F; vec3 += src3 & const_0x3F; vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); vec4 = src0 & const_0x1F; vec5 = src1 & const_0x1F; vec4 += src2 & const_0x1F; vec5 += src3 & const_0x1F; vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 
1); vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); reg0 = vec3 * const_0x70; reg1 = vec1 * const_0x4A; reg2 = vec4 * const_0x70; reg3 = vec1 * const_0x5E; reg0 += const_32896; reg1 += vec4 * const_0x26; reg2 += const_32896; reg3 += vec3 * const_0x12; reg0 -= reg1; reg2 -= reg3; reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); res0 = __msa_copy_u_d((v2i64)dst0, 0); res1 = __msa_copy_u_d((v2i64)dst0, 1); SD(res0, dst_u); SD(res1, dst_v); s += 16; t += 16; dst_u += 8; dst_v += 8; } } void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* s = src_rgb; const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; for (x = 0; x < width; x += 16) { inp0 = (v16u8)__msa_ld_b((void*)s, 0); inp1 = (v16u8)__msa_ld_b((void*)s, 16); inp2 = (v16u8)__msa_ld_b((void*)s, 32); inp3 = (v16u8)__msa_ld_b((void*)t, 0); inp4 = (v16u8)__msa_ld_b((void*)t, 16); inp5 = (v16u8)__msa_ld_b((void*)t, 32); src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); src3 = 
(v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); reg0 += const_0x0001; reg1 += const_0x0001; reg2 += const_0x0001; reg3 += const_0x0001; 
reg0 = __msa_srai_h((v8i16)reg0, 1); reg1 = __msa_srai_h((v8i16)reg1, 1); reg2 = __msa_srai_h((v8i16)reg2, 1); reg3 = __msa_srai_h((v8i16)reg3, 1); vec4 = (v8u16)__msa_pckev_h(reg1, reg0); vec5 = (v8u16)__msa_pckev_h(reg3, reg2); vec6 = (v8u16)__msa_pckod_h(reg1, reg0); vec7 = (v8u16)__msa_pckod_h(reg3, reg2); vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); vec3 = vec0 * const_0x70; vec4 = vec1 * const_0x4A; vec5 = vec2 * const_0x26; vec2 *= const_0x70; vec1 *= const_0x5E; vec0 *= const_0x12; reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); reg0 += reg1; reg2 += reg3; reg0 = __msa_srai_h(reg0, 8); reg2 = __msa_srai_h(reg2, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); res0 = __msa_copy_u_d((v2i64)dst0, 0); res1 = __msa_copy_u_d((v2i64)dst0, 1); SD(res0, dst_u); SD(res1, dst_v); t += 48; s += 48; dst_u += 8; dst_v += 8; } } void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* s = src_rgb; const uint8_t* t = src_rgb + src_stride_rgb; int64_t res0, res1; v16u8 inp0, inp1, inp2, inp3, inp4, inp5; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 reg0, reg1, reg2, reg3; v16u8 dst0; v8u16 const_0x70 = (v8u16)__msa_fill_h(0x38); v8u16 const_0x4A = (v8u16)__msa_fill_h(0x25); v8u16 const_0x26 = (v8u16)__msa_fill_h(0x13); v8u16 const_0x5E = (v8u16)__msa_fill_h(0x2f); v8u16 const_0x12 = (v8u16)__msa_fill_h(0x09); v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; v16i8 zero = {0}; for (x = 0; x < width; x += 16) { inp0 = 
(v16u8)__msa_ld_b((void*)s, 0); inp1 = (v16u8)__msa_ld_b((void*)s, 16); inp2 = (v16u8)__msa_ld_b((void*)s, 32); inp3 = (v16u8)__msa_ld_b((void*)t, 0); inp4 = (v16u8)__msa_ld_b((void*)t, 16); inp5 = (v16u8)__msa_ld_b((void*)t, 32); src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); reg1 = 
(v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); reg0 += const_0x0001; reg1 += const_0x0001; reg2 += const_0x0001; reg3 += const_0x0001; reg0 = __msa_srai_h(reg0, 1); reg1 = __msa_srai_h(reg1, 1); reg2 = __msa_srai_h(reg2, 1); reg3 = __msa_srai_h(reg3, 1); vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); vec3 = vec0 * const_0x70; vec4 = vec1 * const_0x4A; vec5 = vec2 * const_0x26; vec2 *= const_0x70; vec1 *= const_0x5E; vec0 *= const_0x12; reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); reg0 += reg1; reg2 += reg3; reg0 = __msa_srai_h(reg0, 8); reg2 = __msa_srai_h(reg2, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); res0 = __msa_copy_u_d((v2i64)dst0, 0); res1 = __msa_copy_u_d((v2i64)dst0, 1); SD(res0, dst_u); SD(res1, dst_v); t += 48; s += 48; dst_u += 8; dst_v += 8; } } void NV12ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 zero = {0}; v16u8 alpha = 
(v16u8)__msa_ldi_b(ALPHA_VAL); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { val0 = LD(src_y); val1 = LD(src_uv); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); ST_UB2(dst0, dst1, dst_argb, 16); src_y += 8; src_uv += 8; dst_argb += 32; } } void NV12ToRGB565Row_MSA(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { int x; uint64_t val0, val1; v16u8 src0, src1, dst0; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 zero = {0}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { val0 = LD(src_y); val1 = LD(src_uv); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); vec0 = vec0 >> 3; vec1 = (vec1 >> 2) << 5; vec2 = (vec2 >> 3) << 11; dst0 = (v16u8)(vec0 | vec1 | vec2); ST_UB(dst0, dst_rgb565); src_y += 8; src_uv += 8; dst_rgb565 += 16; } } void NV21ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; uint64_t val0, val1; v16u8 src0, src1, res0, res1, dst0, dst1; v8i16 vec0, vec1, 
vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v16u8 zero = {0}; v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { val0 = LD(src_y); val1 = LD(src_vu); src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); ST_UB2(dst0, dst1, dst_argb, 16); src_y += 8; src_vu += 8; dst_argb += 32; } } void SobelRow_MSA(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; v16i8 const_0x4 = __msa_ldi_b(0x4); v16i8 mask1 = mask0 + const_0x4; v16i8 mask2 = mask1 + const_0x4; v16i8 mask3 = mask2 + const_0x4; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); vec0 = __msa_adds_u_b(src0, src1); dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_sobelx += 16; src_sobely += 16; dst_argb += 64; } } void 
SobelToPlaneRow_MSA(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); src1 = (v16u8)__msa_ld_b((void*)src_sobelx, 16); src2 = (v16u8)__msa_ld_b((void*)src_sobely, 0); src3 = (v16u8)__msa_ld_b((void*)src_sobely, 16); dst0 = __msa_adds_u_b(src0, src2); dst1 = __msa_adds_u_b(src1, src3); ST_UB2(dst0, dst1, dst_y, 16); src_sobelx += 32; src_sobely += 32; dst_y += 32; } } void SobelXYRow_MSA(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, vec0, vec1, vec2; v16u8 reg0, reg1, dst0, dst1, dst2, dst3; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_sobelx, 0); src1 = (v16u8)__msa_ld_b((void*)src_sobely, 0); vec0 = __msa_adds_u_b(src0, src1); vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_sobelx += 16; src_sobely += 16; dst_argb += 64; } } void ARGBToYJRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x961D = (v16u8)__msa_fill_h(0x961D); v16u8 const_0x4D = (v16u8)__msa_fill_h(0x4D); v8u16 const_0x80 = (v8u16)__msa_fill_h(0x80); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x961D, 
const_0x4D, const_0x80, 8, dst0); ST_UB(dst0, dst_y); src_argb += 64; dst_y += 16; } } void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); src_argb += 64; dst_y += 16; } } void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); src_argb += 64; dst_y += 16; } } void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width) { int x; v16u8 src0, src1, src2, src3, dst0; v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, dst0); ST_UB(dst0, dst_y); src_argb += 64; dst_y += 16; } } void ARGBToUVJRow_MSA(const uint8_t* 
src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* s = src_rgb; const uint8_t* t = src_rgb + src_stride_rgb; v8u16 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 vec0, vec1, vec2, vec3; v8u16 dst0, dst1, dst2, dst3; v16u8 zero = {0}; v8i16 shuffler0 = {0, 3, 4, 7, 8, 11, 12, 15}; v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; v8i16 shuffler2 = {2, 3, 6, 7, 10, 11, 14, 15}; v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; v8u16 const_0x0000003f = (v8u16)__msa_fill_w(0x0000003f); v4u32 const_0x00008080 = (v8u16)__msa_fill_w(0x00008080); v8u16 const_0x0015002a = (v8u16)__msa_fill_w(0x0015002a); v8u16 const_0x0035000a = (v8u16)__msa_fill_w(0x0035000a); v4i32 shift = __msa_fill_w(0x00000008); for (x = 0; x < width; x += 32) { src1 = __msa_ld_b((void*)s, 0); src3 = __msa_ld_b((void*)s, 16); src5 = __msa_ld_b((void*)t, 0); src7 = __msa_ld_b((void*)t, 16); src0 = __msa_ilvr_b(zero, src1); src1 = __msa_ilvl_b(zero, src1); src2 = __msa_ilvr_b(zero, src3); src3 = __msa_ilvl_b(zero, src3); src4 = __msa_ilvr_b(zero, src5); src5 = __msa_ilvl_b(zero, src5); src6 = __msa_ilvr_b(zero, src7); src7 = __msa_ilvl_b(zero, src7); src0 += src4; src1 += src5; src2 += src6; src3 += src7; src4 = __msa_ilvev_d(src1, src0); src5 = __msa_ilvod_d(src1, src0); src6 = __msa_ilvev_d(src3, src2); src7 = __msa_ilvod_d(src3, src2); vec0 = __msa_aver_u_h(src4, src5); vec1 = __msa_aver_u_h(src6, src7); src1 = __msa_ld_b((void*)s, 32); src3 = __msa_ld_b((void*)s, 48); src5 = __msa_ld_b((void*)t, 32); src7 = __msa_ld_b((void*)t, 48); src0 = __msa_ilvr_b(zero, src1); src1 = __msa_ilvl_b(zero, src1); src2 = __msa_ilvr_b(zero, src3); src3 = __msa_ilvl_b(zero, src3); src4 = __msa_ilvr_b(zero, src5); src5 = __msa_ilvl_b(zero, src5); src6 = __msa_ilvr_b(zero, src7); src7 = __msa_ilvl_b(zero, src7); src0 += src4; src1 += src5; src2 += src6; src3 += src7; src4 = __msa_ilvev_d(src1, src0); src5 = __msa_ilvod_d(src1, src0); src6 = __msa_ilvev_d(src3, src2); src7 = 
__msa_ilvod_d(src3, src2); vec2 = __msa_aver_u_h(src4, src5); vec3 = __msa_aver_u_h(src6, src7); ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, shuffler2, shuffler3, shift, dst0, dst1); src1 = __msa_ld_b((void*)s, 64); src3 = __msa_ld_b((void*)s, 80); src5 = __msa_ld_b((void*)t, 64); src7 = __msa_ld_b((void*)t, 80); src0 = __msa_ilvr_b(zero, src1); src1 = __msa_ilvl_b(zero, src1); src2 = __msa_ilvr_b(zero, src3); src3 = __msa_ilvl_b(zero, src3); src4 = __msa_ilvr_b(zero, src5); src5 = __msa_ilvl_b(zero, src5); src6 = __msa_ilvr_b(zero, src7); src7 = __msa_ilvl_b(zero, src7); src0 += src4; src1 += src5; src2 += src6; src3 += src7; src4 = __msa_ilvev_d(src1, src0); src5 = __msa_ilvod_d(src1, src0); src6 = __msa_ilvev_d(src3, src2); src7 = __msa_ilvod_d(src3, src2); vec0 = __msa_aver_u_h(src4, src5); vec1 = __msa_aver_u_h(src6, src7); src1 = __msa_ld_b((void*)s, 96); src3 = __msa_ld_b((void*)s, 112); src5 = __msa_ld_b((void*)t, 96); src7 = __msa_ld_b((void*)t, 112); src0 = __msa_ilvr_b(zero, src1); src1 = __msa_ilvl_b(zero, src1); src2 = __msa_ilvr_b(zero, src3); src3 = __msa_ilvl_b(zero, src3); src4 = __msa_ilvr_b(zero, src5); src5 = __msa_ilvl_b(zero, src5); src6 = __msa_ilvr_b(zero, src7); src7 = __msa_ilvl_b(zero, src7); src0 += src4; src1 += src5; src2 += src6; src3 += src7; src4 = __msa_ilvev_d(src1, src0); src5 = __msa_ilvod_d(src1, src0); src6 = __msa_ilvev_d(src3, src2); src7 = __msa_ilvod_d(src3, src2); vec2 = __msa_aver_u_h(src4, src5); vec3 = __msa_aver_u_h(src6, src7); ARGBTOUV(vec0, vec1, vec2, vec3, const_0x0000003f, const_0x00008080, const_0x0015002a, const_0x0035000a, shuffler0, shuffler1, shuffler2, shuffler3, shift, dst2, dst3); dst0 = (v8u16)__msa_pckev_b(dst2, dst0); dst1 = (v8u16)__msa_pckev_b(dst3, dst1); ST_UB(dst0, dst_u); ST_UB(dst1, dst_v); s += 128; t += 128; dst_v += 16; dst_u += 16; } } void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* 
dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* s = src_rgb; const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; v8i16 shuffler0 = {1, unused, 5, unused, 9, unused, 13, unused}; v8i16 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15}; v8i16 shuffler2 = {3, unused, 7, unused, 11, unused, 15, unused}; v8i16 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14}; v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 16) { READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, dst1); *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); s += 64; t += 64; dst_u += 8; dst_v += 8; } } void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* s = src_rgb; const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; v8i16 shuffler0 = {0, unused, 4, unused, 8, unused, 12, unused}; v8i16 shuffler1 = {1, 2, 5, 6, 9, 10, 13, 14}; v8i16 shuffler2 = {2, unused, 6, unused, 10, unused, 14, unused}; v8i16 shuffler3 = {0, 1, 4, 5, 8, 9, 12, 13}; v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 16) { READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, 
const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, dst1); *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); s += 64; t += 64; dst_u += 8; dst_v += 8; } } void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; const uint8_t* s = src_rgb; const uint8_t* t = src_rgb + src_stride_rgb; const uint8_t unused = 0xf; v8u16 src0, src1, src2, src3; v16u8 dst0, dst1; v8i16 shuffler0 = {3, unused, 7, unused, 11, unused, 15, unused}; v8i16 shuffler1 = {2, 1, 6, 5, 10, 9, 14, 13}; v8i16 shuffler2 = {1, unused, 5, unused, 9, unused, 13, unused}; v8i16 shuffler3 = {3, 2, 7, 6, 11, 10, 15, 14}; v8u16 const_0x09002f = (v8u16)__msa_fill_w(0x09002f); v8u16 const_0x000038 = (v8u16)__msa_fill_w(0x0038); v8u16 const_0x250013 = (v8u16)__msa_fill_w(0x250013); v4u32 const_0x008080 = (v4u32)__msa_fill_w(0x8080); v8u16 const_0x0001 = (v8u16)__msa_fill_h(0x0001); for (x = 0; x < width; x += 16) { READ_ARGB(s, t, src0, src1, src2, src3, const_0x0001); ARGBTOUV_H(src0, src1, src2, src3, const_0x09002f, const_0x000038, const_0x250013, const_0x008080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, dst1); *((uint64_t*)dst_v) = __msa_copy_u_d((v2i64)dst0, 0); *((uint64_t*)dst_u) = __msa_copy_u_d((v2i64)dst1, 0); s += 64; t += 64; dst_u += 8; dst_v += 8; } } void I444ToARGBRow_MSA(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2, dst0, dst1; v8u16 vec0, vec1, vec2; v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8i16 zero = {0}; YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); for (x = 0; x < width; x += 8) { READI444(src_y, src_u, src_v, src0, src1, src2); 
vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); reg0 *= vec_yg; reg1 *= vec_yg; reg0 = __msa_srai_w(reg0, 16); reg1 = __msa_srai_w(reg1, 16); reg4 = reg0 + vec_br; reg5 = reg1 + vec_br; reg2 = reg0 + vec_bg; reg3 = reg1 + vec_bg; reg0 += vec_bb; reg1 += vec_bb; vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); reg0 -= reg6 * vec_ub; reg1 -= reg7 * vec_ub; reg2 -= reg6 * vec_ug; reg3 -= reg7 * vec_ug; reg4 -= reg8 * vec_vr; reg5 -= reg9 * vec_vr; reg2 -= reg8 * vec_vg; reg3 -= reg9 * vec_vg; reg0 = __msa_srai_w(reg0, 6); reg1 = __msa_srai_w(reg1, 6); reg2 = __msa_srai_w(reg2, 6); reg3 = __msa_srai_w(reg3, 6); reg4 = __msa_srai_w(reg4, 6); reg5 = __msa_srai_w(reg5, 6); CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); ST_UB2(dst0, dst1, dst_argb, 16); src_y += 8; src_u += 8; src_v += 8; dst_argb += 32; } } // TODO - respect YuvConstants void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; #if defined(__aarch64__) || defined(__arm__) int ygb = yuvconstants->kUVBiasBGR[3]; int yg = yuvconstants->kYToRgb[1]; #else int ygb = yuvconstants->kYBiasToRgb[0]; int yg = yuvconstants->kYToRgb[0]; #endif v16u8 src0, res0, res1, res2, res3, res4, dst0, 
dst1, dst2, dst3; v8i16 vec0, vec1; v4i32 reg0, reg1, reg2, reg3; v4i32 vec_yg = __msa_fill_w(yg); v8i16 vec_ygb = __msa_fill_h(ygb); v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); v8i16 max = __msa_ldi_h(0xFF); v8i16 zero = {0}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_y, 0); vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); reg0 = (v4i32)__msa_ilvr_h(zero, vec0); reg1 = (v4i32)__msa_ilvl_h(zero, vec0); reg2 = (v4i32)__msa_ilvr_h(zero, vec1); reg3 = (v4i32)__msa_ilvl_h(zero, vec1); reg0 *= vec_yg; reg1 *= vec_yg; reg2 *= vec_yg; reg3 *= vec_yg; reg0 = __msa_srai_w(reg0, 16); reg1 = __msa_srai_w(reg1, 16); reg2 = __msa_srai_w(reg2, 16); reg3 = __msa_srai_w(reg3, 16); vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); vec0 += vec_ygb; vec1 += vec_ygb; vec0 = __msa_srai_h(vec0, 6); vec1 = __msa_srai_h(vec1, 6); vec0 = __msa_maxi_s_h(vec0, 0); vec1 = __msa_maxi_s_h(vec1, 0); vec0 = __msa_min_s_h(max, vec0); vec1 = __msa_min_s_h(max, vec1); res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_y += 16; dst_argb += 64; } } void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { int x; v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_y, 0); vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); vec1 = 
(v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); src_y += 16; dst_argb += 64; } } void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_yuy2, 0); src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_yuy2 += 16; dst_argb += 32; } } void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { int x; v16u8 src0, src1, src2; v8i16 vec0, vec1, vec2; v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; v4i32 vec_ubvr, vec_ugvg; v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg); vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_uyvy, 0); src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); src2 = 
(v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, vec0, vec1, vec2); STOREARGB(vec0, vec1, vec2, alpha, dst_argb); src_uyvy += 16; dst_argb += 32; } } void InterpolateRow_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int width, int32_t source_y_fraction) { int32_t y1_fraction = source_y_fraction; int32_t y0_fraction = 256 - y1_fraction; uint16_t y_fractions; const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; v16u8 src0, src1, src2, src3, dst0, dst1; v8u16 vec0, vec1, vec2, vec3, y_frac; if (0 == y1_fraction) { memcpy(dst_ptr, src_ptr, width); return; } if (128 == y1_fraction) { for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((void*)s, 0); src1 = (v16u8)__msa_ld_b((void*)s, 16); src2 = (v16u8)__msa_ld_b((void*)t, 0); src3 = (v16u8)__msa_ld_b((void*)t, 16); dst0 = __msa_aver_u_b(src0, src2); dst1 = __msa_aver_u_b(src1, src3); ST_UB2(dst0, dst1, dst_ptr, 16); s += 32; t += 32; dst_ptr += 32; } return; } y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); y_frac = (v8u16)__msa_fill_h(y_fractions); for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((void*)s, 0); src1 = (v16u8)__msa_ld_b((void*)s, 16); src2 = (v16u8)__msa_ld_b((void*)t, 0); src3 = (v16u8)__msa_ld_b((void*)t, 16); vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); dst0 = 
(v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); ST_UB2(dst0, dst1, dst_ptr, 16); s += 32; t += 32; dst_ptr += 32; } } void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { int x; v4i32 dst0 = __builtin_msa_fill_w(v32); for (x = 0; x < width; x += 4) { ST_UB(dst0, dst_argb); dst_argb += 16; } } void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, 18, 17, 16, 21, 20, 19, 24, 23}; v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, 24, 23, 28, 27, 26, 31, 30, 29}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_raw, 0); src1 = (v16u8)__msa_ld_b((void*)src_raw, 16); src2 = (v16u8)__msa_ld_b((void*)src_raw, 32); src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); ST_UB2(dst0, dst1, dst_rgb24, 16); ST_UB(dst2, (dst_rgb24 + 32)); src_raw += 48; dst_rgb24 += 48; } } void MergeUVRow_MSA(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { int x; v16u8 src0, src1, dst0, dst1; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_u, 0); src1 = (v16u8)__msa_ld_b((void*)src_v, 0); dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); ST_UB2(dst0, dst1, dst_uv, 16); src_u += 16; src_v += 16; dst_uv += 32; } } void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, uint8_t* dst_a, int width) { int i; v16u8 src0, src1, src2, src3, vec0, vec1, dst0; for (i = 0; i < width; i += 16) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = 
(v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb, 32); src3 = (v16u8)__msa_ld_b((void*)src_argb, 48); vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_a); src_argb += 64; dst_a += 16; } } void ARGBBlendRow_MSA(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8u16 vec8, vec9, vec10, vec11, vec12, vec13; v8u16 const_256 = (v8u16)__msa_ldi_h(256); v16u8 const_255 = (v16u8)__msa_ldi_b(255); v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; v16i8 zero = {0}; for (x = 0; x < width; x += 8) { src0 = (v16u8)__msa_ld_b((void*)src_argb, 0); src1 = (v16u8)__msa_ld_b((void*)src_argb, 16); src2 = (v16u8)__msa_ld_b((void*)src_argb1, 0); src3 = (v16u8)__msa_ld_b((void*)src_argb1, 16); vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); vec8 = (v8u16)__msa_fill_h(vec0[3]); vec9 = (v8u16)__msa_fill_h(vec0[7]); vec10 = (v8u16)__msa_fill_h(vec1[3]); vec11 = (v8u16)__msa_fill_h(vec1[7]); vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); vec10 = (v8u16)__msa_fill_h(vec2[3]); vec11 = (v8u16)__msa_fill_h(vec2[7]); vec12 = (v8u16)__msa_fill_h(vec3[3]); vec13 = (v8u16)__msa_fill_h(vec3[7]); vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); vec8 = const_256 - vec8; vec9 = const_256 - vec9; vec10 = const_256 - vec10; 
vec11 = const_256 - vec11; vec8 *= vec4; vec9 *= vec5; vec10 *= vec6; vec11 *= vec7; vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); dst2 = (v16u8)__msa_pckev_b((v16i8)vec9, (v16i8)vec8); dst3 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); dst0 = (v16u8)__msa_adds_u_b(dst0, dst2); dst1 = (v16u8)__msa_adds_u_b(dst1, dst3); dst0 = __msa_bmnz_v(dst0, const_255, mask); dst1 = __msa_bmnz_v(dst1, const_255, mask); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; src_argb1 += 32; dst_argb += 32; } } void ARGBQuantizeRow_MSA(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; v4i32 vec_scale = __msa_fill_w(scale); v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size); v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; v16i8 zero = {0}; for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)dst_argb, 0); src1 = (v16u8)__msa_ld_b((void*)dst_argb, 16); src2 = (v16u8)__msa_ld_b((void*)dst_argb, 32); src3 = (v16u8)__msa_ld_b((void*)dst_argb, 48); vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); tmp1 = 
(v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); tmp0 *= vec_scale; tmp1 *= vec_scale; tmp2 *= vec_scale; tmp3 *= vec_scale; tmp4 *= vec_scale; tmp5 *= vec_scale; tmp6 *= vec_scale; tmp7 *= vec_scale; tmp8 *= vec_scale; tmp9 *= vec_scale; tmp10 *= vec_scale; tmp11 *= vec_scale; tmp12 *= vec_scale; tmp13 *= vec_scale; tmp14 *= vec_scale; tmp15 *= vec_scale; tmp0 >>= 16; tmp1 >>= 16; tmp2 >>= 16; tmp3 >>= 16; tmp4 >>= 16; tmp5 >>= 16; tmp6 >>= 16; tmp7 >>= 16; tmp8 >>= 16; tmp9 >>= 16; tmp10 >>= 16; tmp11 >>= 16; tmp12 >>= 16; tmp13 >>= 16; tmp14 >>= 16; tmp15 >>= 16; vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); 
dst0 *= vec_int_sz; dst1 *= vec_int_sz; dst2 *= vec_int_sz; dst3 *= vec_int_sz; dst0 += vec_int_ofst; dst1 += vec_int_ofst; dst2 += vec_int_ofst; dst3 += vec_int_ofst; dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); dst_argb += 64; } } void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { int32_t x; v16i8 src0; v16u8 src1, src2, dst0, dst1; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; v16i8 zero = {0}; v8i16 max = __msa_ldi_h(255); src0 = __msa_ld_b((void*)matrix_argb, 0); vec0 = (v8i16)__msa_ilvr_b(zero, src0); vec1 = (v8i16)__msa_ilvl_b(zero, src0); for (x = 0; x < width; x += 8) { src1 = (v16u8)__msa_ld_b((void*)src_argb, 0); src2 = (v16u8)__msa_ld_b((void*)src_argb, 16); vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); vec10 = vec2 * vec0; vec11 = vec2 * vec1; vec12 = vec6 * vec0; vec13 = vec6 * vec1; tmp0 = __msa_hadd_s_w(vec10, vec10); tmp1 = __msa_hadd_s_w(vec11, vec11); tmp2 = __msa_hadd_s_w(vec12, vec12); 
tmp3 = __msa_hadd_s_w(vec13, vec13); vec14 = vec3 * vec0; vec15 = vec3 * vec1; vec16 = vec7 * vec0; vec17 = vec7 * vec1; tmp4 = __msa_hadd_s_w(vec14, vec14); tmp5 = __msa_hadd_s_w(vec15, vec15); tmp6 = __msa_hadd_s_w(vec16, vec16); tmp7 = __msa_hadd_s_w(vec17, vec17); vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); tmp0 = __msa_hadd_s_w(vec10, vec10); tmp1 = __msa_hadd_s_w(vec11, vec11); tmp2 = __msa_hadd_s_w(vec12, vec12); tmp3 = __msa_hadd_s_w(vec13, vec13); tmp0 = __msa_srai_w(tmp0, 6); tmp1 = __msa_srai_w(tmp1, 6); tmp2 = __msa_srai_w(tmp2, 6); tmp3 = __msa_srai_w(tmp3, 6); vec2 = vec4 * vec0; vec6 = vec4 * vec1; vec3 = vec8 * vec0; vec7 = vec8 * vec1; tmp8 = __msa_hadd_s_w(vec2, vec2); tmp9 = __msa_hadd_s_w(vec6, vec6); tmp10 = __msa_hadd_s_w(vec3, vec3); tmp11 = __msa_hadd_s_w(vec7, vec7); vec4 = vec5 * vec0; vec8 = vec5 * vec1; vec5 = vec9 * vec0; vec9 = vec9 * vec1; tmp12 = __msa_hadd_s_w(vec4, vec4); tmp13 = __msa_hadd_s_w(vec8, vec8); tmp14 = __msa_hadd_s_w(vec5, vec5); tmp15 = __msa_hadd_s_w(vec9, vec9); vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); tmp4 = __msa_hadd_s_w(vec14, vec14); tmp5 = __msa_hadd_s_w(vec15, vec15); tmp6 = __msa_hadd_s_w(vec16, vec16); tmp7 = __msa_hadd_s_w(vec17, vec17); tmp4 = __msa_srai_w(tmp4, 6); tmp5 = __msa_srai_w(tmp5, 6); tmp6 = __msa_srai_w(tmp6, 6); tmp7 = __msa_srai_w(tmp7, 6); vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); vec10 = __msa_maxi_s_h(vec10, 0); vec11 = __msa_maxi_s_h(vec11, 0); vec12 = __msa_maxi_s_h(vec12, 0); vec13 = __msa_maxi_s_h(vec13, 0); vec10 = 
__msa_min_s_h(vec10, max); vec11 = __msa_min_s_h(vec11, max); vec12 = __msa_min_s_h(vec12, max); vec13 = __msa_min_s_h(vec13, max); dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); ST_UB2(dst0, dst1, dst_argb, 16); src_argb += 32; dst_argb += 32; } } void SplitUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; for (x = 0; x < width; x += 32) { src0 = (v16u8)__msa_ld_b((void*)src_uv, 0); src1 = (v16u8)__msa_ld_b((void*)src_uv, 16); src2 = (v16u8)__msa_ld_b((void*)src_uv, 32); src3 = (v16u8)__msa_ld_b((void*)src_uv, 48); dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, dst_u, 16); ST_UB2(dst2, dst3, dst_v, 16); src_uv += 64; dst_u += 32; dst_v += 32; } } void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { int x; v16u8 dst0 = (v16u8)__msa_fill_b(v8); for (x = 0; x < width; x += 16) { ST_UB(dst0, dst); dst += 16; } } void MirrorSplitUVRow_MSA(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { int x; v16u8 src0, src1, src2, src3; v16u8 dst0, dst1, dst2, dst3; v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; src_uv += (2 * width); for (x = 0; x < width; x += 32) { src_uv -= 64; src2 = (v16u8)__msa_ld_b((void*)src_uv, 0); src3 = (v16u8)__msa_ld_b((void*)src_uv, 16); src0 = (v16u8)__msa_ld_b((void*)src_uv, 32); src1 = (v16u8)__msa_ld_b((void*)src_uv, 48); dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, 
dst_v, 16); ST_UB2(dst2, dst3, dst_u, 16); dst_u += 32; dst_v += 32; } } void SobelXRow_MSA(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, uint8_t* dst_sobelx, int32_t width) { int x; v16u8 src0, src1, src2, src3, src4, src5, dst0; v8i16 vec0, vec1, vec2, vec3, vec4, vec5; v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; v16i8 tmp = __msa_ldi_b(8); v16i8 mask1 = mask0 + tmp; v8i16 zero = {0}; v8i16 max = __msa_ldi_h(255); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); src1 = (v16u8)__msa_ld_b((void*)src_y0, 16); src2 = (v16u8)__msa_ld_b((void*)src_y1, 0); src3 = (v16u8)__msa_ld_b((void*)src_y1, 16); src4 = (v16u8)__msa_ld_b((void*)src_y2, 0); src5 = (v16u8)__msa_ld_b((void*)src_y2, 16); vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); vec0 += vec2; vec1 += vec3; vec4 += vec2; vec5 += vec3; vec0 += vec4; vec1 += vec5; vec0 = __msa_add_a_h(zero, vec0); vec1 = __msa_add_a_h(zero, vec1); vec0 = __msa_maxi_s_h(vec0, 0); vec1 = __msa_maxi_s_h(vec1, 0); vec0 = __msa_min_s_h(max, vec0); vec1 = __msa_min_s_h(max, vec1); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_sobelx); src_y0 += 16; src_y1 += 16; src_y2 += 16; dst_sobelx += 16; } } void SobelYRow_MSA(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int32_t width) { int 
x; v16u8 src0, src1, dst0; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; v8i16 zero = {0}; v8i16 max = __msa_ldi_h(255); for (x = 0; x < width; x += 16) { src0 = (v16u8)__msa_ld_b((void*)src_y0, 0); src1 = (v16u8)__msa_ld_b((void*)src_y1, 0); vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); vec0 -= vec2; vec1 -= vec3; vec6[0] = src_y0[16] - src_y1[16]; vec6[1] = src_y0[17] - src_y1[17]; vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); vec0 += vec2; vec1 += vec3; vec4 += vec2; vec5 += vec3; vec0 += vec4; vec1 += vec5; vec0 = __msa_add_a_h(zero, vec0); vec1 = __msa_add_a_h(zero, vec1); vec0 = __msa_maxi_s_h(vec0, 0); vec1 = __msa_maxi_s_h(vec1, 0); vec0 = __msa_min_s_h(max, vec0); vec1 = __msa_min_s_h(max, vec1); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst_sobely); src_y0 += 16; src_y1 += 16; dst_sobely += 16; } } void HalfFloatRow_MSA(const uint16_t* src, uint16_t* dst, float scale, int width) { int i; v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; v4f32 mult_vec; v8i16 zero = {0}; mult_vec[0] = 1.9259299444e-34f * scale; mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); for (i = 0; i < width; i += 32) { src0 = (v8u16)__msa_ld_h((void*)src, 0); src1 = (v8u16)__msa_ld_h((void*)src, 16); src2 = (v8u16)__msa_ld_h((void*)src, 32); src3 = (v8u16)__msa_ld_h((void*)src, 48); vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); vec4 = 
(v4u32)__msa_ilvr_h(zero, (v8i16)src2); vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); fvec0 = __msa_ffint_u_w(vec0); fvec1 = __msa_ffint_u_w(vec1); fvec2 = __msa_ffint_u_w(vec2); fvec3 = __msa_ffint_u_w(vec3); fvec4 = __msa_ffint_u_w(vec4); fvec5 = __msa_ffint_u_w(vec5); fvec6 = __msa_ffint_u_w(vec6); fvec7 = __msa_ffint_u_w(vec7); fvec0 *= mult_vec; fvec1 *= mult_vec; fvec2 *= mult_vec; fvec3 *= mult_vec; fvec4 *= mult_vec; fvec5 *= mult_vec; fvec6 *= mult_vec; fvec7 *= mult_vec; vec0 = ((v4u32)fvec0) >> 13; vec1 = ((v4u32)fvec1) >> 13; vec2 = ((v4u32)fvec2) >> 13; vec3 = ((v4u32)fvec3) >> 13; vec4 = ((v4u32)fvec4) >> 13; vec5 = ((v4u32)fvec5) >> 13; vec6 = ((v4u32)fvec6) >> 13; vec7 = ((v4u32)fvec7) >> 13; dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); ST_UH2(dst0, dst1, dst, 8); ST_UH2(dst2, dst3, dst + 16, 8); src += 32; dst += 32; } } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) libyuv-0.0~git20220104.b91df1a/source/row_neon.cc000066400000000000000000005330421416500237200211720ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/row.h" #include #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC Neon #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) // q0: Y uint16x8_t // d2: U uint8x8_t // d3: V uint8x8_t // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ "vld1.8 {d0}, [%[src_y]]! \n" \ "vld1.32 {d2[0]}, [%[src_u]]! \n" \ "vld1.32 {d2[1]}, [%[src_v]]! \n" \ "vmov.u8 d1, d0 \n" \ "vmovl.u8 q1, d2 \n" \ "vzip.u8 d0, d1 \n" \ "vsli.u16 q1, q1, #8 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ "vld1.8 {d0}, [%[src_y]]! \n" \ "vld1.8 {d2}, [%[src_u]]! \n" \ "vmovl.u8 q0, d0 \n" \ "vld1.8 {d3}, [%[src_v]]! \n" \ "vsli.u16 q0, q0, #8 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ "vld1.8 {d0}, [%[src_y]]! \n" \ "vmov.u8 q1, #128 \n" \ "vmovl.u8 q0, d0 \n" \ "vsli.u16 q0, q0, #8 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ "vld1.8 {d0}, [%[src_y]]! \n" \ "vld1.8 {d2}, [%[src_uv]]! \n" \ "vmov.u8 d1, d0 \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d0, d1 \n" \ "vsli.u16 d2, d2, #8 \n" /* Duplicate low byte (U) */ \ "vsri.u16 d3, d3, #8 \n" /* Duplicate high byte (V) */ // Read 8 Y and 4 VU from NV21 #define READNV21 \ "vld1.8 {d0}, [%[src_y]]! \n" \ "vld1.8 {d2}, [%[src_vu]]! \n" \ "vmov.u8 d1, d0 \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d0, d1 \n" \ "vsri.u16 d2, d2, #8 \n" /* Duplicate high byte (U) */ \ "vsli.u16 d3, d3, #8 \n" /* Duplicate low byte (V) */ // Read 8 YUY2 #define READYUY2 \ "vld2.8 {d0, d2}, [%[src_yuy2]]! \n" \ "vmovl.u8 q0, d0 \n" \ "vmov.u8 d3, d2 \n" \ "vsli.u16 q0, q0, #8 \n" \ "vsli.u16 d2, d2, #8 \n" \ "vsri.u16 d3, d3, #8 \n" // Read 8 UYVY #define READUYVY \ "vld2.8 {d2, d3}, [%[src_uyvy]]! \n" \ "vmovl.u8 q0, d3 \n" \ "vmov.u8 d3, d2 \n" \ "vsli.u16 q0, q0, #8 \n" \ "vsli.u16 d2, d2, #8 \n" \ "vsri.u16 d3, d3, #8 \n" #define YUVTORGB_SETUP \ "vld4.8 {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \ "vld1.16 {d31[]}, [%[kRGBCoeffBias]]! 
\n" \ "vld1.16 {d20[], d21[]}, [%[kRGBCoeffBias]]! \n" \ "vld1.16 {d22[], d23[]}, [%[kRGBCoeffBias]]! \n" \ "vld1.16 {d24[], d25[]}, [%[kRGBCoeffBias]] \n" // q0: B uint16x8_t // q1: G uint16x8_t // q2: R uint16x8_t // Convert from YUV to 2.14 fixed point RGB #define YUVTORGB \ "vmull.u16 q2, d1, d31 \n" \ "vmull.u8 q8, d3, d29 \n" /* DGV */ \ "vmull.u16 q0, d0, d31 \n" \ "vmlal.u8 q8, d2, d28 \n" /* DG */ \ "vqshrn.u32 d0, q0, #16 \n" \ "vqshrn.u32 d1, q2, #16 \n" /* Y */ \ "vmull.u8 q9, d2, d26 \n" /* DB */ \ "vmull.u8 q2, d3, d27 \n" /* DR */ \ "vadd.u16 q4, q0, q11 \n" /* G */ \ "vadd.u16 q2, q0, q2 \n" /* R */ \ "vadd.u16 q0, q0, q9 \n" /* B */ \ "vqsub.u16 q1, q4, q8 \n" /* G */ \ "vqsub.u16 q0, q0, q10 \n" /* B */ \ "vqsub.u16 q2, q2, q12 \n" /* R */ // Convert from 2.14 fixed point RGB To 8 bit RGB #define RGBTORGB8 \ "vqshrn.u16 d4, q2, #6 \n" /* R */ \ "vqshrn.u16 d2, q1, #6 \n" /* G */ \ "vqshrn.u16 d0, q0, #6 \n" /* B */ #define YUVTORGB_REGS \ "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31" #define STORERGBA \ "vmov.u8 d1, d0 \n" \ "vmov.u8 d3, d4 \n" \ "vmov.u8 d0, d6 \n" \ "vst4.8 {d0, d1, d2, d3}, [%[dst_rgba]]! \n" void I444ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV444 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void I444AlphaToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "1: \n" READYUV444 YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [src_a] "+r"(src_a), // %[src_a] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422AlphaToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, const uint8_t* src_a, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 "vld1.8 {d6}, [%[src_a]]! \n" "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [src_a] "+r"(src_a), // %[src_a] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGBARow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" STORERGBA "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_rgba] "+r"(dst_rgba), // %[dst_rgba] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void I422ToRGB24Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP 
"vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS); } #define ARGBTORGB565 \ "vshll.u8 q2, d4, #8 \n" /* R */ \ "vshll.u8 q1, d2, #8 \n" /* G */ \ "vshll.u8 q0, d0, #8 \n" /* B */ \ "vsri.16 q2, q1, #5 \n" /* RG */ \ "vsri.16 q2, q0, #11 \n" /* RGB */ void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS); } #define ARGBTOARGB1555 \ "vshll.u8 q3, d6, #8 \n" /* A */ \ "vshll.u8 q2, d4, #8 \n" /* R */ \ "vshll.u8 q1, d2, #8 \n" /* G */ \ "vshll.u8 q0, d0, #8 \n" /* B */ \ "vsri.16 q3, q2, #1 \n" /* AR */ \ "vsri.16 q3, q1, #6 \n" /* ARG */ \ "vsri.16 q3, q0, #11 \n" /* ARGB */ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vmov.u8 d6, #0xff \n" ARGBTOARGB1555 "vst1.8 {q3}, [%[dst_argb1555]]! \n" // store 8 pixels RGB1555. 
"bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "q3"); } #define ARGBTOARGB4444 \ "vshr.u8 d0, d0, #4 \n" /* B */ \ "vbic.32 d2, d2, d7 \n" /* G */ \ "vshr.u8 d4, d4, #4 \n" /* R */ \ "vbic.32 d6, d6, d7 \n" /* A */ \ "vorr d0, d0, d2 \n" /* BG */ \ "vorr d1, d4, d6 \n" /* RA */ \ "vzip.u8 d0, d1 \n" /* BGRA */ void I422ToARGB4444Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "vmov.u8 d7, #0x0f \n" // vbic bits to clear "1: \n" READYUV422 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTOARGB4444 "vst1.8 {q0}, [%[dst_argb4444]]! \n" // store 8 pixels "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "q3"); } void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUV400 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d23, #255 \n" "1: \n" "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" "subs %2, %2, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "d20", "d21", "d22", "d23"); } void NV12ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV21ToARGBRow_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! 
\n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_vu] "+r"(src_vu), // %[src_vu] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void NV12ToRGB24Row_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS); } void NV21ToRGB24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV21 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst3.8 {d0, d2, d4}, [%[dst_rgb24]]! \n" "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_vu] "+r"(src_vu), // %[src_vu] [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS); } void NV12ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READNV12 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. 
"bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READYUY2 YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" READUYVY YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" "vst4.8 {d0, d2, d4, d6}, [%[dst_argb]]! \n" "bgt 1b \n" : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy] [dst_argb] "+r"(dst_argb), // %[dst_argb] [width] "+r"(width) // %[width] : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff] [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias] : "cc", "memory", YUVTORGB_REGS, "d6"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop "vst1.8 {q0}, [%1]! \n" // store U "vst1.8 {q1}, [%2]! 
\n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List ); } // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load U "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. void SplitRGBRow_NEON(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB "subs %4, %4, #16 \n" // 16 processed per loop "vst1.8 {q0}, [%1]! \n" // store R "vst1.8 {q1}, [%2]! \n" // store G "vst1.8 {q2}, [%3]! \n" // store B "bgt 1b \n" : "+r"(src_rgb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(width) // %4 : // Input registers : "cc", "memory", "d0", "d1", "d2" // Clobber List ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time void MergeRGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, uint8_t* dst_rgb, int width) { asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q2}, [%2]! \n" // load B "subs %4, %4, #16 \n" // 16 processed per loop "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB "vst3.8 {d1, d3, d5}, [%3]! 
\n" // next 8 RGB "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_rgb), // %3 "+r"(width) // %4 : // Input registers : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } // Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a. void SplitARGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, uint8_t* dst_a, int width) { asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB "subs %5, %5, #16 \n" // 16 processed per loop "vst1.8 {q0}, [%3]! \n" // store B "vst1.8 {q1}, [%2]! \n" // store G "vst1.8 {q2}, [%1]! \n" // store R "vst1.8 {q3}, [%4]! \n" // store A "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_r), // %1 "+r"(dst_g), // %2 "+r"(dst_b), // %3 "+r"(dst_a), // %4 "+r"(width) // %5 : // Input registers : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } // Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time void MergeARGBRow_NEON(const uint8_t* src_r, const uint8_t* src_g, const uint8_t* src_b, const uint8_t* src_a, uint8_t* dst_argb, int width) { asm volatile( "1: \n" "vld1.8 {q2}, [%0]! \n" // load R "vld1.8 {q1}, [%1]! \n" // load G "vld1.8 {q0}, [%2]! \n" // load B "vld1.8 {q3}, [%3]! \n" // load A "subs %5, %5, #16 \n" // 16 processed per loop "vst4.8 {d0, d2, d4, d6}, [%4]! \n" // store 8 ARGB "vst4.8 {d1, d3, d5, d7}, [%4]! \n" // next 8 ARGB "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_argb), // %4 "+r"(width) // %5 : // Input registers : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } // Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b. void SplitXRGBRow_NEON(const uint8_t* src_argb, uint8_t* dst_r, uint8_t* dst_g, uint8_t* dst_b, int width) { asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! 
\n"  // next 8 ARGB
      "subs %4, %4, #16 \n"    // 16 processed per loop
      "vst1.8 {q0}, [%3]! \n"  // store B
      "vst1.8 {q1}, [%2]! \n"  // store G
      "vst1.8 {q2}, [%1]! \n"  // store R
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(width)      // %4
      :                  // Input registers
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time.
// There is no alpha input plane: q3 is filled with 255 once before the loop
// and stored as the fourth byte of every pixel.
void MergeXRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "vmov.u8 q3, #255 \n"  // load A(255)
      "1: \n"
      "vld1.8 {q2}, [%0]! \n"              // load R
      "vld1.8 {q1}, [%1]! \n"              // load G
      "vld1.8 {q0}, [%2]! \n"              // load B
      "subs %4, %4, #16 \n"                // 16 processed per loop
      "vst4.8 {d0, d2, d4, d6}, [%3]! \n"  // store 8 ARGB
      "vst4.8 {d1, d3, d5, d7}, [%3]! \n"  // next 8 ARGB
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      :                  // Input registers
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// Merges planar 16-bit R, G and B into packed 32-bit AR30, 4 pixels per loop.
// Each channel is widened to 32 bits, shifted by (10 - depth) via vshl,
// clamped to 1023, then packed as B | G << 10 | R << 20 with the top two
// bits (0xc0000000) set.
void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = 10 - depth;
  asm volatile(
      "vmov.u32 q14, #1023 \n"  // per-channel maximum (10 bits)
      "vdup.32 q15, %5 \n"      // shift count in every lane
      "1: \n"
      "vld1.16 {d4}, [%2]! \n"         // B
      "vld1.16 {d2}, [%1]! \n"         // G
      "vld1.16 {d0}, [%0]! \n"         // R
      "vmovl.u16 q2, d4 \n"            // B
      "vmovl.u16 q1, d2 \n"            // G
      "vmovl.u16 q0, d0 \n"            // R
      "vshl.u32 q2, q2, q15 \n"        // 000B
      "vshl.u32 q1, q1, q15 \n"
      "vshl.u32 q0, q0, q15 \n"
      "vmin.u32 q2, q2, q14 \n"
      "vmin.u32 q1, q1, q14 \n"
      "vmin.u32 q0, q0, q14 \n"
      "vsli.u32 q2, q1, #10 \n"        // 00GB
      "vsli.u32 q2, q0, #20 \n"        // 0RGB
      "vorr.u32 q2, #0xc0000000 \n"    // ARGB (AR30)
      "subs %4, %4, #4 \n"
      "vst1.8 {q2}, [%3]! 
\n" "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_ar30), // %3 "+r"(width) // %4 : "r"(shift) // %5 : "memory", "cc", "q0", "q1", "q2", "q14", "q15"); } void MergeXR30Row_10_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_ar30, int /* depth */, int width) { asm volatile( "vmov.u32 q14, #1023 \n" "1: \n" "vld1.16 {d4}, [%2]! \n" // B "vld1.16 {d2}, [%1]! \n" // G "vld1.16 {d0}, [%0]! \n" // R "vmovl.u16 q2, d4 \n" // 000B "vmovl.u16 q1, d2 \n" // G "vmovl.u16 q0, d0 \n" // R "vmin.u32 q2, q2, q14 \n" "vmin.u32 q1, q1, q14 \n" "vmin.u32 q0, q0, q14 \n" "vsli.u32 q2, q1, #10 \n" // 00GB "vsli.u32 q2, q0, #20 \n" // 0RGB "vorr.u32 q2, #0xc0000000 \n" // ARGB (AR30) "subs %4, %4, #4 \n" "vst1.8 {q2}, [%3]! \n" "bgt 1b \n" "3: \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_ar30), // %3 "+r"(width) // %4 : : "memory", "cc", "q0", "q1", "q2", "q14"); } void MergeAR64Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; asm volatile( "vdup.u16 q15, %6 \n" "vdup.u16 q14, %7 \n" "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B "vld1.16 {q3}, [%3]! \n" // A "vmin.u16 q2, q2, q14 \n" "vmin.u16 q1, q1, q14 \n" "vmin.u16 q0, q0, q14 \n" "vmin.u16 q3, q3, q14 \n" "vshl.u16 q2, q2, q15 \n" "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" "vshl.u16 q3, q3, q15 \n" "subs %5, %5, #8 \n" "vst4.16 {d0, d2, d4, d6}, [%4]! \n" "vst4.16 {d1, d3, d5, d7}, [%4]! 
\n" "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_ar64), // %4 "+r"(width) // %5 : "r"(shift), // %6 "r"(mask) // %7 : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); } void MergeXR64Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; asm volatile( "vmov.u8 q3, #0xff \n" // A (0xffff) "vdup.u16 q15, %5 \n" "vdup.u16 q14, %6 \n" "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B "vmin.u16 q2, q2, q14 \n" "vmin.u16 q1, q1, q14 \n" "vmin.u16 q0, q0, q14 \n" "vshl.u16 q2, q2, q15 \n" "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" "subs %4, %4, #8 \n" "vst4.16 {d0, d2, d4, d6}, [%3]! \n" "vst4.16 {d1, d3, d5, d7}, [%3]! \n" "bgt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_ar64), // %3 "+r"(width) // %4 : "r"(shift), // %5 "r"(mask) // %6 : "memory", "cc", "q0", "q1", "q2", "q3", "q15"); } void MergeARGB16To8Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width) { int shift = 8 - depth; asm volatile( "vdup.16 q15, %6 \n" "1: \n" "vld1.16 {q2}, [%0]! \n" // R "vld1.16 {q1}, [%1]! \n" // G "vld1.16 {q0}, [%2]! \n" // B "vld1.16 {q3}, [%3]! \n" // A "vshl.u16 q2, q2, q15 \n" "vshl.u16 q1, q1, q15 \n" "vshl.u16 q0, q0, q15 \n" "vshl.u16 q3, q3, q15 \n" "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d1, q1 \n" "vqmovn.u16 d2, q2 \n" "vqmovn.u16 d3, q3 \n" "subs %5, %5, #8 \n" "vst4.8 {d0, d1, d2, d3}, [%4]! 
\n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : "r"(shift)       // %6
      : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}

// Merges planar 16-bit R, G and B into packed 8-bit ARGB with alpha fixed at
// 0xff, 8 pixels per loop. Each channel is shifted by shift = 8 - depth via
// vshl and narrowed to 8 bits with unsigned saturation (vqmovn).
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(
      "vdup.16 q15, %5 \n"    // shift count in every lane
      "vmov.u8 d6, #0xff \n"  // A (0xff)
      "1: \n"
      "vld1.16 {q2}, [%0]! \n"  // R
      "vld1.16 {q1}, [%1]! \n"  // G
      "vld1.16 {q0}, [%2]! \n"  // B
      "vshl.u16 q2, q2, q15 \n"
      "vshl.u16 q1, q1, q15 \n"
      "vshl.u16 q0, q0, q15 \n"
      "vqmovn.u16 d5, q2 \n"  // R -> d5
      "vqmovn.u16 d4, q1 \n"  // G -> d4
      "vqmovn.u16 d3, q0 \n"  // B -> d3
      "subs %4, %4, #8 \n"
      "vst4.u8 {d3, d4, d5, d6}, [%3]! \n"
      "bgt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory", "cc", "q0", "q1", "q2", "d6", "q15");
}

// Copy multiple of 32 bytes per loop using vld1.8 / vst1.8.
// NOTE(review): the original note here read "vld4.8 allow unaligned and is
// fastest on a15", but the code below uses vld1.8 - confirm which
// instruction that performance note refers to.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1: \n"
      "vld1.8 {d0, d1, d2, d3}, [%0]! \n"  // load 32
      "subs %2, %2, #32 \n"                // 32 processed per loop
      "vst1.8 {d0, d1, d2, d3}, [%1]! \n"  // store 32
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2  // Output registers
      :              // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// SetRow writes 'width' bytes using an 8 bit value repeated.
// Stores are done 16 bytes at a time while the remaining count is positive.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "vdup.8 q0, %2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %1, %1, #16 \n"    // 16 bytes per loop
      "vst1.8 {q0}, [%0]! \n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc", "memory", "q0");
}

// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
// Stores are done 4 pixels (16 bytes) at a time.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "vdup.u32 q0, %2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %1, %1, #4 \n"  // 4 pixels per loop
      "vst1.8 {q0}, [%0]! 
\n"  // store
      "bgt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc", "memory", "q0");
}

// Reverses the byte order of a row, 32 bytes per loop. The source pointer is
// moved to the last 32 bytes of the row and walks backwards (post-index by
// -32) while the destination walks forwards.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "add %0, %0, %2 \n"
      "sub %0, %0, #32 \n"  // 32 bytes per loop

      "1: \n"
      "vld1.8 {q1, q2}, [%0], %3 \n"  // src -= 32
      "subs %2, #32 \n"               // 32 pixels per loop.
      "vrev64.8 q0, q2 \n"            // reverse bytes within each 64-bit half
      "vrev64.8 q1, q1 \n"
      "vswp d0, d1 \n"                // then swap the halves
      "vswp d2, d3 \n"
      "vst1.8 {q0, q1}, [%1]! \n"     // dst += 32
      "bgt 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "r"(-32)     // %3
      : "cc", "memory", "q0", "q1", "q2");
}

// Reverses the order of interleaved UV pairs in a row, 8 pairs (16 bytes) per
// loop. vld2.8 splits U and V into d0/d1, each is byte-reversed, and vst2.8
// re-interleaves them. r12 holds the -16 source step.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %2, lsl #1 \n"
      "sub %0, #16 \n"

      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %2, #8 \n"                 // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst2.8 {d0, d1}, [%1]! \n"      // dst += 16
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(width)    // %2
      :
      : "cc", "memory", "r12", "q0");
}

// Reverses a row of interleaved UV pairs and writes the U and V bytes to
// separate planes, 8 pairs per loop. r12 holds the -16 source step.
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      // Start at end of source row.
      "mov r12, #-16 \n"
      "add %0, %0, %3, lsl #1 \n"
      "sub %0, #16 \n"

      "1: \n"
      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
      "subs %3, #8 \n"                 // 8 pixels per loop.
      "vrev64.8 q0, q0 \n"
      "vst1.8 {d0}, [%1]! \n"          // dst += 8
      "vst1.8 {d1}, [%2]! \n"
      "bgt 1b \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "cc", "memory", "r12", "q0");
}

// Reverses the order of 4-byte pixels in a row, 8 pixels (32 bytes) per loop.
// vld4.8 de-interleaves the four channels, each channel register is
// byte-reversed, and vst4.8 re-interleaves them.
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "add %0, %0, %2, lsl #2 \n"  // src += width * 4
      "sub %0, #32 \n"             // back up to the last 8 pixels

      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n"  // src -= 32
      "subs %2, #8 \n"                        // 8 pixels per loop.
      "vrev64.8 d0, d0 \n"
      "vrev64.8 d1, d1 \n"
      "vrev64.8 d2, d2 \n"
      "vrev64.8 d3, d3 \n"
      "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n"  // dst += 32
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(-32)         // %3
      : "cc", "memory", "d0", "d1", "d2", "d3");
}

// Reverses the order of 3-byte pixels in a row, 8 pixels (24 bytes) per loop.
// The source pointer is first advanced in C to the last 24 bytes of the row
// and then steps backwards by 24 per iteration.
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  src_rgb24 += width * 3 - 24;
  asm volatile(
      "1: \n"
      "vld3.8 {d0, d1, d2}, [%0], %3 \n"  // src -= 24
      "subs %2, #8 \n"                    // 8 pixels per loop.
      "vrev64.8 d0, d0 \n"
      "vrev64.8 d1, d1 \n"
      "vrev64.8 d2, d2 \n"
      "vst3.8 {d0, d1, d2}, [%1]! \n"     // dst += 24
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "r"(-24)          // %3
      : "cc", "memory", "d0", "d1", "d2");
}

// Expands 3-byte RGB24 pixels to 4-byte ARGB by appending a constant 255
// alpha byte, 8 pixels per loop. Channel order is unchanged.
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"      // load 8 pixels of RGB24.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

// Expands 3-byte RAW pixels to 4-byte ARGB, 8 pixels per loop: the first and
// third bytes of each pixel are swapped and a constant 255 alpha byte is
// appended.
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8 d4, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"      // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vswp.u8 d1, d3 \n"                  // swap R, B
      "vst4.8 {d1, d2, d3, d4}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

// Expands 3-byte RAW pixels to 4-byte RGBA, 8 pixels per loop: the first and
// third bytes of each pixel are swapped and a constant 255 alpha byte is
// stored first (d0), ahead of the colour bytes.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "vmov.u8 d0, #255 \n"  // Alpha
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"      // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vswp.u8 d1, d3 \n"                  // swap R, B
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGBA.
      "bgt 1b \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

// Converts 3-byte RAW pixels to 3-byte RGB24 by swapping the first and third
// bytes of each pixel, 8 pixels per loop.
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1: \n"
      "vld3.8 {d1, d2, d3}, [%0]! \n"  // load 8 pixels of RAW.
      "subs %2, %2, #8 \n"             // 8 processed per loop.
      "vswp.u8 d1, d3 \n"              // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"  // store 8 pixels of RGB24.
      "bgt 1b \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
  );
}

// Asm fragment: expands 8 RGB565 pixels held in q0 to 8-bit B, G, R in
// d0, d1, d2. Each 5- or 6-bit field is moved to the top of its byte and its
// own high bits are replicated into the low bits. Uses q2 and d6 as scratch.
#define RGB565TOARGB                                                  \
  "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */                          \
  "vuzp.u8 d0, d1 \n"       /* d0 xxxBBBBB RRRRRxxx */                \
  "vshl.u8 d6, d6, #2 \n"   /* G GGGGGG00 upper 6 */                  \
  "vshr.u8 d1, d1, #3 \n"   /* R 000RRRRR lower 5 */                  \
  "vshl.u8 q0, q0, #3 \n"   /* B,R BBBBB000 upper 5 */                \
  "vshr.u8 q2, q0, #5 \n"   /* B,R 00000BBB lower 3 */                \
  "vorr.u8 d0, d0, d4 \n"   /* B */                                   \
  "vshr.u8 d4, d6, #6 \n"   /* G 000000GG lower 2 */                  \
  "vorr.u8 d2, d1, d5 \n"   /* R */                                   \
  "vorr.u8 d1, d4, d6 \n"   /* G */

// Converts RGB565 pixels to ARGB with alpha fixed at 255, 8 pixels per loop.
// d3 (alpha) is set once before the loop; RGB565TOARGB fills d0, d1, d2.
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      "subs %2, %2, #8 \n"     // 8 processed per loop.
      RGB565TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// Asm fragment: expands 8 ARGB1555 pixels held in q0 to 8-bit channels with
// B, G in q0 (d0, d1) and R, A in q1 (d2, d3). The single alpha bit is turned
// into 0x00 or 0xff via shift-then-negate. Uses q2 and q3 as scratch.
#define ARGB1555TOARGB                                                \
  "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */                          \
  "vshr.u8 d6, d7, #2 \n"   /* R xxxRRRRR */                          \
  "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */                          \
  "vmovn.u16 d4, q0 \n"     /* B xxxBBBBB */                          \
  "vshr.u8 d7, d7, #7 \n"   /* A 0000000A */                          \
  "vneg.s8 d7, d7 \n"       /* A AAAAAAAA upper 8 */                  \
  "vshl.u8 d6, d6, #3 \n"   /* R RRRRR000 upper 5 */                  \
  "vshr.u8 q1, q3, #5 \n"   /* R,A 00000RRR lower 3 */                \
  "vshl.u8 q0, q2, #3 \n"   /* B,G BBBBB000 upper 5 */                \
  "vshr.u8 q2, q0, #5 \n"   /* B,G 00000BBB lower 3 */                \
  "vorr.u8 q1, q1, q3 \n"   /* R,A */                                 \
  "vorr.u8 q0, q0, q2 \n"   /* B,G */

// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Results are left in d0 (B), d1 (G) and d2 (R); d3 is not written.
#define RGB555TOARGB                                                  \
  "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */                          \
  "vuzp.u8 d0, d1 \n"       /* d0 xxxBBBBB xRRRRRxx */                \
  "vshl.u8 d6, d6, #3 \n"   /* G GGGGG000 upper 5 */                  \
  "vshr.u8 d1, d1, #2 \n"   /* R 00xRRRRR lower 5 */                  \
  "vshl.u8 q0, q0, #3 \n"   /* B,R BBBBB000 upper 5 */                \
  "vshr.u8 q2, q0, #5 \n"   /* B,R 00000BBB lower 3 */                \
  "vorr.u8 d0, d0, d4 \n"   /* B */                                   \
  "vshr.u8 d4, d6, #5 \n"   /* G 00000GGG lower 3 */                  \
  "vorr.u8 d2, d1, d5 \n"   /* R */                                   \
  "vorr.u8 d1, d4, d6 \n"   /* G */

// Converts ARGB1555 pixels to ARGB, 8 pixels per loop.
// Note: the 255 preloaded into d3 is overwritten on every iteration, because
// ARGB1555TOARGB writes q1 (d2, d3) with R and the expanded alpha.
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      "subs %2, %2, #8 \n"     // 8 processed per loop.
      ARGB1555TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// Asm fragment: expands 8 ARGB4444 pixels held in q0 to 8-bit channels in
// d0 (B), d1 (G), d2 (R), d3 (A). Each 4-bit value is duplicated into both
// nibbles of its byte. Uses q2 as scratch.
#define ARGB4444TOARGB                                                \
  "vuzp.u8 d0, d1 \n"     /* d0 BG, d1 RA */                          \
  "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */                          \
  "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */                          \
  "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */                          \
  "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */                          \
  "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */                          \
  "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */                          \
  "vswp.u8 d1, d2 \n"     /* B,R,G,A -> B,G,R,A */

// Converts ARGB4444 pixels to ARGB, 8 pixels per loop.
// Note: the 255 preloaded into d3 is overwritten on every iteration, because
// ARGB4444TOARGB writes q1 (d2, d3).
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8 d3, #255 \n"  // Alpha
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      "subs %2, %2, #8 \n"     // 8 processed per loop.
      ARGB4444TOARGB
      "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
      "bgt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}

// Converts 4-byte ARGB pixels to 3-byte RGB24 by dropping the fourth byte of
// each pixel, 16 pixels per loop.
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 16 pixels of ARGB.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
      "subs %2, %2, #16 \n"                // 16 processed per loop.
      "vst3.8 {d0, d2, d4}, [%1]! \n"      // store 16 RGB24 pixels.
      "vst3.8 {d1, d3, d5}, [%1]! \n"
      "bgt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// Converts 4-byte ARGB pixels to 3-byte RAW, 8 pixels per loop: the fourth
// byte is dropped and the first and third bytes of each pixel are swapped.
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d1, d2, d3, d4}, [%0]! \n"  // load 8 pixels of ARGB.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vswp.u8 d1, d3 \n"                  // swap R, B
      "vst3.8 {d1, d2, d3}, [%1]! \n"      // store 8 pixels of RAW.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}

// Extracts the Y bytes from a YUY2 row, 16 pixels per loop. vld2.8 splits
// even bytes into q0 and odd bytes into q1; the even bytes (q0) are stored.
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %2, %2, #16 \n"        // 16 processed per loop.
      "vst1.8 {q0}, [%1]! \n"      // store 16 pixels of Y.
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// Extracts the Y bytes from a UYVY row, 16 pixels per loop. vld2.8 splits
// even bytes into q0 and odd bytes into q1; the odd bytes (q1) are stored.
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %2, %2, #16 \n"        // 16 processed per loop.
      "vst1.8 {q1}, [%1]! \n"      // store 16 pixels of Y.
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}

// Extracts U and V from a single YUY2 row, 16 pixels (8 U and 8 V) per loop.
// vld4.8 splits each 4-byte group into d0..d3; bytes 1 (d1) and 3 (d3) are
// stored as U and V.
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n"                // 16 pixels = 8 UVs.
      "vst1.8 {d1}, [%1]! \n"              // store 8 U.
      "vst1.8 {d3}, [%2]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

// Extracts U and V from a single UYVY row, 16 pixels (8 U and 8 V) per loop.
// vld4.8 splits each 4-byte group into d0..d3; bytes 0 (d0) and 2 (d2) are
// stored as U and V.
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %3, %3, #16 \n"                // 16 pixels = 8 UVs.
      "vst1.8 {d0}, [%1]! \n"              // store 8 U.
      "vst1.8 {d2}, [%2]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}

// Extracts U and V from two YUY2 rows and stores their rounded average
// (vrhadd), 16 pixels (8 U and 8 V) per loop. The second row starts at
// src_yuy2 + stride_yuy2; the stride operand's register is reused to hold
// that row pointer.
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %4, %4, #16 \n"                // 16 pixels = 8 UVs.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row YUY2.
      "vrhadd.u8 d1, d1, d5 \n"            // average rows of U
      "vrhadd.u8 d3, d3, d7 \n"            // average rows of V
      "vst1.8 {d1}, [%2]! \n"              // store 8 U.
      "vst1.8 {d3}, [%3]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}

// Extracts U and V from two UYVY rows and stores their rounded average
// (vrhadd), 16 pixels (8 U and 8 V) per loop. The second row starts at
// src_uyvy + stride_uyvy; the stride operand's register is reused to hold
// that row pointer.
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_uyvy
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 16 pixels of UYVY.
      "subs %4, %4, #16 \n"                // 16 pixels = 8 UVs.
      "vld4.8 {d4, d5, d6, d7}, [%1]! \n"  // load next row UYVY.
      "vrhadd.u8 d0, d0, d4 \n"            // average rows of U
      "vrhadd.u8 d2, d2, d6 \n"            // average rows of V
      "vst1.8 {d0}, [%2]! \n"              // store 8 U.
      "vst1.8 {d2}, [%3]! \n"              // store 8 V.
      "bgt 1b \n"
      : "+r"(src_uyvy),     // %0
        "+r"(stride_uyvy),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders bytes with a 16-byte table lookup: 16 index bytes are read from
// shuffler once, then each group of 4 pixels is permuted with vtbl.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  asm volatile(
      "vld1.8 {q2}, [%3] \n"  // shuffler
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels "vst1.8 {q1}, [%1]! \n" // store 4. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "q0", "q1", "q2" // Clobber List ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { asm volatile( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys "vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d3}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_yuy2), // %3 "+r"(width) // %4 : : "cc", "memory", "d0", "d1", "d2", "d3"); } void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { asm volatile( "1: \n" "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys "vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d2}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_uyvy), // %3 "+r"(width) // %4 : : "cc", "memory", "d0", "d1", "d2", "d3"); } void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 "vst1.8 {q2}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "d6"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width) { asm volatile( "vdup.32 d7, %2 \n" // dither4 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%1]! 
\n" // load 8 pixels of ARGB. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d7 \n" "vqadd.u8 d2, d2, d7 \n" "vqadd.u8 d4, d4, d7 \n" // add for dither ARGBTORGB565 "vst1.8 {q2}, [%0]! \n" // store 8 RGB565. "bgt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 "vst1.8 {q3}, [%1]! \n" // store 8 ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( "vmov.u8 d7, #0x0f \n" // bits to clear with // vbic. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3"); } void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}

// Copies the fourth byte (alpha) of each 4-byte ARGB pixel to dst_a,
// 16 pixels per loop.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels
      "subs %2, %2, #16 \n"                // 16 processed per loop
      "vst1.8 {q3}, [%1]! \n"              // store 16 A's.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// Computes 8-bit YJ from ARGB, 8 pixels per loop:
// Y = sat(round((29 * B + 150 * G + 77 * R) / 256)), with no offset added.
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #29 \n"   // B * 0.1140 coefficient
      "vmov.u8 d25, #150 \n"  // G * 0.5870 coefficient
      "vmov.u8 d26, #77 \n"   // R * 0.2990 coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"            // B
      "vmlal.u8 q2, d1, d25 \n"            // G
      "vmlal.u8 q2, d2, d26 \n"            // R
      "vqrshrn.u16 d0, q2, #8 \n"          // 16 bit to 8 bit Y
      "vst1.8 {d0}, [%1]! \n"              // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}

// Same as ARGBToYJRow_NEON but for RGBA input: the colour channels are taken
// from bytes 1, 2 and 3 of each pixel (d1, d2, d3) instead of bytes 0, 1, 2.
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #29 \n"   // B * 0.1140 coefficient
      "vmov.u8 d25, #150 \n"  // G * 0.5870 coefficient
      "vmov.u8 d26, #77 \n"   // R * 0.2990 coefficient
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 RGBA pixels.
      "subs %2, %2, #8 \n"                 // 8 processed per loop.
      "vmull.u8 q2, d1, d24 \n"            // B
      "vmlal.u8 q2, d2, d25 \n"            // G
      "vmlal.u8 q2, d3, d26 \n"            // R
      "vqrshrn.u16 d0, q2, #8 \n"          // 16 bit to 8 bit Y
      "vst1.8 {d0}, [%1]! \n"              // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}

// 8x1 pixels.
// Computes one U and one V per ARGB pixel (no subsampling), 8 pixels per
// loop:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// with the final narrowing done with unsigned saturation (vqshrn).
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #112 \n"      // UB / VR 0.875 coefficient
      "vmov.u8 d25, #74 \n"       // UG -0.5781 coefficient
      "vmov.u8 d26, #38 \n"       // UR -0.2969 coefficient
      "vmov.u8 d27, #18 \n"       // VB -0.1406 coefficient
      "vmov.u8 d28, #94 \n"       // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 ARGB pixels.
      "subs %3, %3, #8 \n"                 // 8 processed per loop.
      "vmull.u8 q2, d0, d24 \n"            // B
      "vmlsl.u8 q2, d1, d25 \n"            // G
      "vmlsl.u8 q2, d2, d26 \n"            // R
      "vadd.u16 q2, q2, q15 \n"            // +128 -> unsigned

      "vmull.u8 q3, d2, d24 \n"            // R
      "vmlsl.u8 q3, d1, d28 \n"            // G
      "vmlsl.u8 q3, d0, d27 \n"            // B
      "vadd.u16 q3, q3, q15 \n"            // +128 -> unsigned

      "vqshrn.u16 d0, q2, #8 \n"           // 16 bit to 8 bit U
      "vqshrn.u16 d1, q3, #8 \n"           // 16 bit to 8 bit V

      "vst1.8 {d0}, [%1]! \n"              // store 8 pixels U.
      "vst1.8 {d1}, [%2]! \n"              // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
        "q15");
}

// clang-format off
// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Asm fragment: QB, QG and QR name the q registers holding 8 16-bit B, G and
// R sums. It expects the coefficients in q10 (UB / VR), q11 (UG), q12 (UR),
// q13 (VB), q14 (VG) and the bias in q15, uses q8 and q9 as scratch, and
// leaves 8 U bytes in d0 and 8 V bytes in d1.
#define RGBTOUV(QB, QG, QR)                                                 \
  "vmul.s16 q8, " #QB ", q10 \n" /* B */                                    \
  "vmls.s16 q8, " #QG ", q11 \n" /* G */                                    \
  "vmls.s16 q8, " #QR ", q12 \n" /* R */                                    \
  "vadd.u16 q8, q8, q15 \n"      /* +128 -> unsigned */                     \
  "vmul.s16 q9, " #QR ", q10 \n" /* R */                                    \
  "vmls.s16 q9, " #QG ", q14 \n" /* G */                                    \
  "vmls.s16 q9, " #QB ", q13 \n" /* B */                                    \
  "vadd.u16 q9, q9, q15 \n"      /* +128 -> unsigned */                     \
  "vqshrn.u16 d0, q8, #8 \n"     /* 16 bit to 8 bit U */                    \
  "vqshrn.u16 d1, q9, #8 \n"     /* 16 bit to 8 bit V */
// clang-format on

// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// Computes subsampled U and V from two ARGB rows: each output sample covers
// a 2x2 block of pixels. 16 source pixels per row are consumed per loop and
// 8 U and 8 V bytes are produced. The second row starts at
// src_argb + src_stride_argb; the stride operand's register is reused to
// hold that row pointer.
// Channel sums of the 2x2 block are halved (rounded) and multiplied by the
// halved coefficients before the RGBTOUV fragment narrows them to 8 bits.
void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"          // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"   // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"   // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"   // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"   // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"     // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"     // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"                   // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"                   // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"                   // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"                   // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"                   // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"                   // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"               // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"                   // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"                 // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"                 // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// TODO(fbarchard): Subsample match C code.
// Computes subsampled UJ and VJ from two ARGB rows: each output sample
// covers a 2x2 block of pixels. Same structure as the other two-row RGBTOUV
// users in this file, with the coefficient set 127 / 84 / 43 / 20 / 107
// (each halved). 16 source pixels per row are consumed per loop and 8 U and
// 8 V bytes are produced. The stride operand's register is reused to hold
// the second-row pointer.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile (
      "add %1, %0, %1 \n"          // src_stride + src_argb
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"   // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"   // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"   // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"     // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"     // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"                   // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"                   // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"                   // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"                   // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"                   // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"                   // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"               // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"                   // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"                 // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"                 // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// TODO(fbarchard): Subsample match C code.
// Computes subsampled UJ and VJ from two RGB24 rows: each output sample
// covers a 2x2 block of pixels. Pixels are 3 bytes, loaded with vld3.8; the
// first byte of each pixel is treated as B and the third as R. 16 source
// pixels per row are consumed per loop and 8 U and 8 V bytes are produced.
// The stride operand's register is reused to hold the second-row pointer.
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
                        int src_stride_rgb24,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile (
      "add %1, %0, %1 \n"          // src_stride + src_rgb24
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"   // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"   // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"   // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"   // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"    // load 8 RGB24 pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"    // load next 8 RGB24 pixels.
      "vpaddl.u8 q0, q0 \n"              // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"              // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"              // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RGB24 pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RGB24 pixels.
      "vpadal.u8 q0, q4 \n"              // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"              // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"              // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"          // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"              // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"            // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"            // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb24),         // %0
        "+r"(src_stride_rgb24),  // %1
        "+r"(dst_u),             // %2
        "+r"(dst_v),             // %3
        "+r"(width)              // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

// TODO(fbarchard): Subsample match C code.
void RAWToUVJRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. "vrshr.u16 q0, q0, #1 \n" // 2x average "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_raw), // %0 "+r"(src_stride_raw), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. "vrshr.u16 q1, q1, #1 \n" // 2x average "vrshr.u16 q2, q2, #1 \n" "vrshr.u16 q3, q3, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q3, q2, q1) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_stride_bgra), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. "vrshr.u16 q0, q0, #1 \n" // 2x average "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_stride_abgr), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. "vrshr.u16 q0, q0, #1 \n" // 2x average "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_stride_rgba), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. "vrshr.u16 q0, q0, #1 \n" // 2x average "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_stride_rgb24), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } void RAWToUVRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. "vrshr.u16 q0, q0, #1 \n" // 2x average "vrshr.u16 q1, q1, #1 \n" "vrshr.u16 q2, q2, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q2, q1, q0) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(src_stride_raw), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vrshr.u16 q4, q4, #1 \n" // 2x average "vrshr.u16 q5, q5, #1 \n" "vrshr.u16 q6, q6, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. "vmul.s16 q8, q4, q10 \n" // B "vmls.s16 q8, q5, q11 \n" // G "vmls.s16 q8, q6, q12 \n" // R "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned "vmul.s16 q9, q6, q10 \n" // R "vmls.s16 q9, q5, q14 \n" // G "vmls.s16 q9, q4, q13 \n" // B "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_stride_rgb565), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vrshr.u16 q4, q4, #1 \n" // 2x average "vrshr.u16 q5, q5, #1 \n" "vrshr.u16 q6, q6, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. 
"vmul.s16 q8, q4, q10 \n" // B "vmls.s16 q8, q5, q11 \n" // G "vmls.s16 q8, q6, q12 \n" // R "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned "vmul.s16 q9, q6, q10 \n" // R "vmls.s16 q9, q5, q14 \n" // G "vmls.s16 q9, q4, q13 \n" // B "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_stride_argb1555), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 // coefficient "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. 
ARGB4444TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. "vrshr.u16 q0, q4, #1 \n" // 2x average "vrshr.u16 q1, q5, #1 \n" "vrshr.u16 q2, q6, #1 \n" "subs %4, %4, #16 \n" // 16 processed per loop. RGBTOUV(q0, q1, q2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_stride_argb4444), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" "vst1.8 {d0}, [%1]! 
\n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient "vmov.u8 d27, #16 \n" // Add 16 constant "1: \n" "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" "vmov.u8 q1, q0 \n" "vmov.u8 q3, q2 \n" "subs %2, %2, #8 \n" // 8 processed per loop. "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3"); } static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( "vld1.8 {q4}, [%3] \n" // shuffler "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" "vtbl.8 d2, {d0, d1}, d8 \n" "vtbl.8 d3, {d0, d1}, d9 \n" "vtbl.8 d6, {d4, d5}, d8 \n" "vtbl.8 d7, {d4, d5}, d9 \n" "vmov.u8 q0, q1 \n" "vmov.u8 q2, q3 \n" "subs %2, %2, #8 \n" // 8 processed per loop. "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels "vst2.8 {q2, q3}, [%1]! 
\n" // store 4 pixels "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 : "r"(&kShuffleARGBToABGR) // %3 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! \n" "vld1.16 {q3}, [%0]! \n" "vshrn.u16 d0, q0, #8 \n" "vshrn.u16 d1, q1, #8 \n" "vshrn.u16 d4, q2, #8 \n" "vshrn.u16 d5, q3, #8 \n" "subs %2, %2, #8 \n" // 8 processed per loop. "vst1.8 {q0}, [%1]! \n" // store 4 pixels "vst1.8 {q2}, [%1]! \n" // store 4 pixels "bgt 1b \n" : "+r"(src_ar64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3"); } static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( "vld1.8 {d8}, [%3] \n" // shuffler "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! \n" "vld1.16 {q3}, [%0]! \n" "vtbl.8 d0, {d0, d1}, d8 \n" "vtbl.8 d1, {d2, d3}, d8 \n" "vtbl.8 d4, {d4, d5}, d8 \n" "vtbl.8 d5, {d6, d7}, d8 \n" "subs %2, %2, #8 \n" // 8 processed per loop. "vst1.8 {q0}, [%1]! \n" // store 4 pixels "vst1.8 {q2}, [%1]! \n" // store 4 pixels "bgt 1b \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(&kShuffleAB64ToARGB) // %3 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. 
"vmull.u8 q8, d1, d4 \n" // R "vmlal.u8 q8, d2, d5 \n" // G "vmlal.u8 q8, d3, d6 \n" // B "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // R "vmlal.u8 q8, d1, d5 \n" // G "vmlal.u8 q8, d2, d6 \n" // B "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // B "vmlal.u8 q8, d2, d5 \n" // G "vmlal.u8 q8, d3, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
"bgt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d4, #25 \n" // B * 0.1016 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient "vmov.u8 d6, #66 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B "vmlal.u8 q8, d1, d5 \n" // G "vmlal.u8 q8, d2, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient "vmov.u8 d5, #129 \n" // G * 0.5078 coefficient "vmov.u8 d4, #66 \n" // R * 0.2578 coefficient "vmov.u8 d7, #16 \n" // Add 16 constant "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B "vmlal.u8 q8, d1, d5 \n" // G "vmlal.u8 q8, d2, d6 \n" // R "vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { asm volatile( "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient "vmov.u8 d6, #77 \n" // R * 0.2990 coefficient "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. 
"vmull.u8 q4, d0, d4 \n" // B "vmlal.u8 q4, d1, d5 \n" // G "vmlal.u8 q4, d2, d6 \n" // R "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); } void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { asm volatile( "vmov.u8 d6, #29 \n" // B * 0.1140 coefficient "vmov.u8 d5, #150 \n" // G * 0.5870 coefficient "vmov.u8 d4, #77 \n" // R * 0.2990 coefficient "1: \n" "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q4, d0, d4 \n" // R "vmlal.u8 q4, d1, d5 \n" // G "vmlal.u8 q4, d2, d6 \n" // B "vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4"); } // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; asm volatile( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" "cmp %4, #128 \n" "beq 50f \n" "vdup.8 d5, %4 \n" "rsb %4, #256 \n" "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" "vmlal.u8 q13, d2, d5 \n" "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 50 / 50. "50: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" "vst1.8 {q0}, [%0]! 
\n" "bgt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 "+r"(dst_width), // %3 "+r"(y1_fraction) // %4 : : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( "subs %3, #8 \n" "blt 89f \n" // Blend 8 pixels. "8: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a "vmull.u8 q11, d5, d3 \n" // dg * a "vmull.u8 q12, d6, d3 \n" // dr * a "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" "89: \n" "adds %3, #8-1 \n" "blt 99f \n" // Blend 1 pixels. "1: \n" "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a "vmull.u8 q11, d5, d3 \n" // dg * a "vmull.u8 q12, d6, d3 \n" // dr * a "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. 
"bge 1b \n" "99: \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); } // Attenuate 8 pixels at a time. void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( // Attenuate 8 pixels. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q10, d0, d3 \n" // b * a "vmull.u8 q11, d1, d3 \n" // g * a "vmull.u8 q12, d2, d3 \n" // r * a "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile( "vdup.u16 q8, %2 \n" "vshr.u16 q8, q8, #1 \n" // scale >>= 1 "vdup.u16 q9, %3 \n" // interval multiply. "vdup.u16 q10, %4 \n" // interval add // 8 pixel loop. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) "vmovl.u8 q1, d2 \n" "vmovl.u8 q2, d4 \n" "vqdmulh.s16 q0, q0, q8 \n" // b * scale "vqdmulh.s16 q1, q1, q8 \n" // g "vqdmulh.s16 q2, q2, q8 \n" // r "vmul.u16 q0, q0, q9 \n" // b * interval_size "vmul.u16 q1, q1, q9 \n" // g "vmul.u16 q2, q2, q9 \n" // r "vadd.u16 q0, q0, q10 \n" // b + interval_offset "vadd.u16 q1, q1, q10 \n" // g "vadd.u16 q2, q2, q10 \n" // r "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d2, q1 \n" "vqmovn.u16 d4, q2 \n" "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. 
"bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 "r"(interval_size), // %3 "r"(interval_offset) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { asm volatile( "vdup.u32 q0, %3 \n" // duplicate scale value. "vzip.u8 d0, d1 \n" // d0 aarrggbb. "vshr.u16 q0, q0, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 255) "vmovl.u8 q11, d22 \n" "vmovl.u8 q12, d24 \n" "vmovl.u8 q13, d26 \n" "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 "vqrdmulh.s16 q11, q11, d0[1] \n" // g "vqrdmulh.s16 q12, q12, d0[2] \n" // r "vqrdmulh.s16 q13, q13, d0[3] \n" // a "vqmovn.u16 d20, q10 \n" "vqmovn.u16 d22, q11 \n" "vqmovn.u16 d24, q12 \n" "vqmovn.u16 d26, q13 \n" "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(value) // %3 : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. 
"vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d20, #17 \n" // BB coefficient "vmov.u8 d21, #68 \n" // BG coefficient "vmov.u8 d22, #35 \n" // BR coefficient "vmov.u8 d24, #22 \n" // GB coefficient "vmov.u8 d25, #88 \n" // GG coefficient "vmov.u8 d26, #45 \n" // GR coefficient "vmov.u8 d28, #24 \n" // BB coefficient "vmov.u8 d29, #98 \n" // BG coefficient "vmov.u8 d30, #50 \n" // BR coefficient "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B "vmlal.u8 q2, d1, d21 \n" // G "vmlal.u8 q2, d2, d22 \n" // R "vmull.u8 q3, d0, d24 \n" // B to Sepia G "vmlal.u8 q3, d1, d25 \n" // G "vmlal.u8 q3, d2, d26 \n" // R "vmull.u8 q8, d0, d28 \n" // B to Sepia R "vmlal.u8 q8, d1, d29 \n" // G "vmlal.u8 q8, d2, d30 \n" // R "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", "q14", "q15"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. 
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const int8_t* matrix_argb, int width) { asm volatile( "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. "1: \n" "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit "vmovl.u8 q9, d18 \n" // g "vmovl.u8 q10, d20 \n" // r "vmovl.u8 q11, d22 \n" // a "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A "vqadd.s16 q12, q12, q4 \n" // Accumulate B "vqadd.s16 q13, q13, q5 \n" // Accumulate G "vqadd.s16 q14, q14, q6 \n" // Accumulate R "vqadd.s16 q15, q15, q7 \n" // Accumulate A "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 
d22, q15, #6 \n" // 16 bit to 8 bit A "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(matrix_argb) // %3 : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. void ARGBMultiplyRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q0, d0, d1 \n" // multiply B "vmull.u8 q1, d2, d3 \n" // multiply G "vmull.u8 q2, d4, d5 \n" // multiply R "vmull.u8 q3, d6, d7 \n" // multiply A "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. void ARGBAddRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q1, q1, q3 \n" // add R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. 
void ARGBSubtractRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( // 8 pixel loop. "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB "subs %3, %3, #8 \n" // 8 processed per loop. "vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q1, q1, q3 \n" // subtract R, A "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. // A = 255 // R = Sobel // G = Sobel // B = Sobel void SobelRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d1 \n" // add "vmov.u8 d1, d0 \n" "vmov.u8 d2, d0 \n" "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { asm volatile( // 16 pixel loop. "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. "vqadd.u8 q0, q0, q1 \n" // add "vst1.8 {q0}, [%2]! \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_y), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. 
// A = 255 // R = Sobel X // G = Sobel // B = Sobel Y void SobelXYRow_NEON(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d3, #255 \n" // alpha // 8 pixel loop. "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d1, d0, d2 \n" // add "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 "+r"(src_sobely), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 void SobelXRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, const uint8_t* src_y2, uint8_t* dst_sobelx, int width) { asm volatile( "1: \n" "vld1.8 {d0}, [%0],%5 \n" // top "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" "vld1.8 {d2}, [%1],%5 \n" // center * 2 "vld1.8 {d3}, [%1],%6 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" "vld1.8 {d2}, [%2],%5 \n" // bottom "vld1.8 {d3}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" "vst1.8 {d0}, [%3]! 
\n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(src_y2), // %2 "+r"(dst_sobelx), // %3 "+r"(width) // %4 : "r"(2), // %5 "r"(6) // %6 : "cc", "memory", "q0", "q1" // Clobber List ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 void SobelYRow_NEON(const uint8_t* src_y0, const uint8_t* src_y1, uint8_t* dst_sobely, int width) { asm volatile( "1: \n" "vld1.8 {d0}, [%0],%4 \n" // left "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" "vld1.8 {d2}, [%0],%4 \n" // center * 2 "vld1.8 {d3}, [%1],%4 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" "vld1.8 {d2}, [%0],%5 \n" // right "vld1.8 {d3}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" "vst1.8 {d0}, [%2]! \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 "+r"(width) // %3 : "r"(1), // %4 "r"(6) // %5 : "cc", "memory", "q0", "q1" // Clobber List ); } // %y passes a float as a scalar vector for vector * scalar multiply. // the regoster must be d0 to d15 and indexed with [0] or [1] to access // the float in the first or second float of the d-reg void HalfFloat1Row_NEON(const uint16_t* src, uint16_t* dst, float /*unused*/, int width) { asm volatile( "1: \n" "vld1.8 {q1}, [%0]! \n" // load 8 shorts "subs %2, %2, #8 \n" // 8 pixels per loop "vmovl.u16 q2, d2 \n" // 8 int's "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" "vmul.f32 q2, q2, %y3 \n" // adjust exponent "vmul.f32 q3, q3, %y3 \n" "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat "vqshrn.u32 d3, q3, #13 \n" "vst1.8 {q1}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "w"(1.9259299444e-34f) // %3 : "cc", "memory", "q1", "q2", "q3"); } void HalfFloatRow_NEON(const uint16_t* src, uint16_t* dst, float scale, int width) { asm volatile( "1: \n" "vld1.8 {q1}, [%0]! 
\n" // load 8 shorts "subs %2, %2, #8 \n" // 8 pixels per loop "vmovl.u16 q2, d2 \n" // 8 int's "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" "vmul.f32 q2, q2, %y3 \n" // adjust exponent "vmul.f32 q3, q3, %y3 \n" "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat "vqshrn.u32 d3, q3, #13 \n" "vst1.8 {q1}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "w"(scale * 1.9259299444e-34f) // %3 : "cc", "memory", "q1", "q2", "q3"); } void ByteToFloatRow_NEON(const uint8_t* src, float* dst, float scale, int width) { asm volatile( "1: \n" "vld1.8 {d2}, [%0]! \n" // load 8 bytes "subs %2, %2, #8 \n" // 8 pixels per loop "vmovl.u8 q1, d2 \n" // 8 shorts "vmovl.u16 q2, d2 \n" // 8 ints "vmovl.u16 q3, d3 \n" "vcvt.f32.u32 q2, q2 \n" // 8 floats "vcvt.f32.u32 q3, q3 \n" "vmul.f32 q2, q2, %y3 \n" // scale "vmul.f32 q3, q3, %y3 \n" "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "w"(scale) // %3 : "cc", "memory", "q1", "q2", "q3"); } // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. void GaussCol_NEON(const uint16_t* src0, const uint16_t* src1, const uint16_t* src2, const uint16_t* src3, const uint16_t* src4, uint32_t* dst, int width) { asm volatile( "vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d7, #6 \n" // constant 6 "1: \n" "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows "vld1.16 {q2}, [%4]! \n" "vaddl.u16 q0, d2, d4 \n" // * 1 "vaddl.u16 q1, d3, d5 \n" // * 1 "vld1.16 {q2}, [%1]! \n" "vmlal.u16 q0, d4, d6 \n" // * 4 "vmlal.u16 q1, d5, d6 \n" // * 4 "vld1.16 {q2}, [%2]! \n" "vmlal.u16 q0, d4, d7 \n" // * 6 "vmlal.u16 q1, d5, d7 \n" // * 6 "vld1.16 {q2}, [%3]! \n" "vmlal.u16 q0, d4, d6 \n" // * 4 "vmlal.u16 q1, d5, d6 \n" // * 4 "subs %6, %6, #8 \n" // 8 processed per loop "vst1.32 {q0, q1}, [%5]! 
\n" // store 8 samples "bgt 1b \n" : "+r"(src0), // %0 "+r"(src1), // %1 "+r"(src2), // %2 "+r"(src3), // %3 "+r"(src4), // %4 "+r"(dst), // %5 "+r"(width) // %6 : : "cc", "memory", "q0", "q1", "q2", "q3"); } // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( "vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q11, #6 \n" // constant 6 "1: \n" "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples "vld1.32 {q2}, [%0] \n" "vadd.u32 q0, q0, q1 \n" // * 1 "vadd.u32 q1, q1, q2 \n" // * 1 "vld1.32 {q2, q3}, [%2]! \n" "vmla.u32 q0, q2, q11 \n" // * 6 "vmla.u32 q1, q3, q11 \n" // * 6 "vld1.32 {q2, q3}, [%1]! \n" "vld1.32 {q8, q9}, [%3]! \n" "vadd.u32 q2, q2, q8 \n" // add rows for * 4 "vadd.u32 q3, q3, q9 \n" "vmla.u32 q0, q2, q10 \n" // * 4 "vmla.u32 q1, q3, q10 \n" // * 4 "subs %5, %5, #8 \n" // 8 processed per loop "vqshrn.u32 d0, q0, #8 \n" // round and pack "vqshrn.u32 d1, q1, #8 \n" "vst1.u16 {q0}, [%4]! \n" // store 8 samples "bgt 1b \n" : "+r"(src), // %0 "+r"(src1), // %1 "+r"(src2), // %2 "+r"(src3), // %3 "+r"(dst), // %4 "+r"(width) // %5 : : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { asm volatile( "1: \n" "vld1.8 {q2}, [%0]! \n" // load 16 Y values "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values "vmov d1, d0 \n" "vzip.u8 d0, d1 \n" // VV "vmov d3, d2 \n" "vzip.u8 d2, d3 \n" // UU "subs %3, %3, #16 \n" // 16 pixels per loop "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels "vst3.8 {d1, d3, d5}, [%2]! 
\n" "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2"); } void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average "vqrshrun.s16 d0, q1, #2 \n" "subs %3, %3, #16 \n" // 16 processed per loop. "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { asm volatile( "add %1, %0, %1 \n" // src_stride + src_AYUV "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV // pixels. "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV // pixels. "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV // pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average "vqrshrun.s16 d1, q1, #2 \n" "subs %3, %3, #16 \n" // 16 processed per loop. "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. 
"bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_stride_ayuv), // %1 "+r"(dst_vu), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); } // Copy row of AYUV Y's into Y. // Similar to ARGBExtractAlphaRow_NEON void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels "subs %2, %2, #16 \n" // 16 processed per loop "vst1.8 {q2}, [%1]! \n" // store 16 Y's. "bgt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3"); } // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values "vld2.8 {d1, d3}, [%0]! \n" "vorr.u8 q2, q0, q0 \n" // move U after V "subs %2, %2, #16 \n" // 16 pixels per loop "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2"); } void HalfMergeUVRow_NEON(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load 16 U values "vld1.8 {q1}, [%2]! \n" // load 16 V values "vld1.8 {q2}, [%1]! \n" "vld1.8 {q3}, [%3]! \n" "vpaddl.u8 q0, q0 \n" // half size "vpaddl.u8 q1, q1 \n" "vpadal.u8 q0, q2 \n" "vpadal.u8 q1, q3 \n" "vqrshrn.u16 d0, q0, #2 \n" "vqrshrn.u16 d1, q1, #2 \n" "subs %5, %5, #16 \n" // 16 src pixels per loop "vst2.8 {d0, d1}, [%4]! 
\n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_u_1), // %1 "+r"(src_v), // %2 "+r"(src_v_1), // %3 "+r"(dst_uv), // %4 "+r"(width) // %5 : : "cc", "memory", "q0", "q1", "q2", "q3"); } void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width) { int shift = depth - 16; // Negative for right shift. asm volatile( "vdup.16 q2, %4 \n" "1: \n" "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" "subs %3, %3, #8 \n" // 8 src pixels per loop "vst1.16 {q0}, [%1]! \n" // store 8 U pixels "vst1.16 {q1}, [%2]! \n" // store 8 V pixels "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width) { int shift = 16 - depth; asm volatile( "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" "subs %3, %3, #8 \n" // 8 src pixels per loop "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2"); } void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { asm volatile( "vdup.16 q2, %2 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" "vmul.u16 q0, q0, q2 \n" "vmul.u16 q1, q1, q2 \n" "vst1.16 {q0}, [%1]! \n" "vst1.16 {q1}, [%1]! \n" "subs %3, %3, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(scale), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2"); } void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { asm volatile( "vdup.16 q0, %2 \n" "1: \n" "vld1.16 {q1}, [%0]! \n" "vld1.16 {q2}, [%0]! 
\n" "vmovl.u16 q3, d2 \n" "vmovl.u16 q1, d3 \n" "vmovl.u16 q4, d4 \n" "vmovl.u16 q2, d5 \n" "vshl.u32 q3, q3, q0 \n" "vshl.u32 q4, q4, q0 \n" "vshl.u32 q1, q1, q0 \n" "vshl.u32 q2, q2, q0 \n" "vmovn.u32 d2, q3 \n" "vmovn.u32 d3, q1 \n" "vmovn.u32 d4, q4 \n" "vmovn.u32 d5, q2 \n" "vst1.16 {q1}, [%1]! \n" "vst1.16 {q2}, [%1]! \n" "subs %3, %3, #16 \n" // 16 src pixels per loop "bgt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(scale), // %2 "+r"(width) // %3 : : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/row_neon64.cc000066400000000000000000006051001416500237200213370ustar00rootroot00000000000000/* * Copyright 2014 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer // STn over ZIP1+ST1 // Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. // This module is for GCC Neon armv8 64 bit. 
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// v0.8h: Y
// v1.16b: 8U, 8V

// Read 8 Y, 4 U and 4 V from 422
#define READYUV422                          \
  "ldr d0, [%[src_y]], #8 \n"               \
  "ld1 {v1.s}[0], [%[src_u]], #4 \n"        \
  "ld1 {v1.s}[1], [%[src_v]], #4 \n"        \
  "zip1 v0.16b, v0.16b, v0.16b \n"          \
  "prfm pldl1keep, [%[src_y], 448] \n"      \
  "zip1 v1.16b, v1.16b, v1.16b \n"          \
  "prfm pldl1keep, [%[src_u], 128] \n"      \
  "prfm pldl1keep, [%[src_v], 128] \n"

// Read 8 Y, 8 U and 8 V from 444
#define READYUV444                          \
  "ldr d0, [%[src_y]], #8 \n"               \
  "ld1 {v1.d}[0], [%[src_u]], #8 \n"        \
  "prfm pldl1keep, [%[src_y], 448] \n"      \
  "ld1 {v1.d}[1], [%[src_v]], #8 \n"        \
  "prfm pldl1keep, [%[src_u], 448] \n"      \
  "zip1 v0.16b, v0.16b, v0.16b \n"          \
  "prfm pldl1keep, [%[src_v], 448] \n"

// Read 8 Y, and set 8 U and 8 V to 128
#define READYUV400                          \
  "ldr d0, [%[src_y]], #8 \n"               \
  "movi v1.16b, #128 \n"                    \
  "prfm pldl1keep, [%[src_y], 448] \n"      \
  "zip1 v0.16b, v0.16b, v0.16b \n"

// tbl shuffles that duplicate each chroma byte and gather U into the low 8
// bytes and V into the high 8 bytes of v1. kNV21Table selects the opposite
// byte of each pair for VU ordered input.
static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
                                 1, 1, 3, 3, 5, 5, 7, 7};
static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
                                 0, 0, 2, 2, 4, 4, 6, 6};

// Read 8 Y and 4 UV from NV12 or NV21
// Expects the shuffle table to have been loaded into v2.
#define READNV12                            \
  "ldr d0, [%[src_y]], #8 \n"               \
  "ldr d1, [%[src_uv]], #8 \n"              \
  "zip1 v0.16b, v0.16b, v0.16b \n"          \
  "prfm pldl1keep, [%[src_y], 448] \n"      \
  "tbl v1.16b, {v1.16b}, v2.16b \n"         \
  "prfm pldl1keep, [%[src_uv], 448] \n"

// Read 8 YUY2
// Expects the shuffle table to have been loaded into v2.
#define READYUY2                                \
  "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n"   \
  "zip1 v0.16b, v0.16b, v0.16b \n"              \
  "prfm pldl1keep, [%[src_yuy2], 448] \n"       \
  "tbl v1.16b, {v1.16b}, v2.16b \n"

// Read 8 UYVY
// Expects the shuffle table to have been loaded into v2. Uses v3 and v4 as
// temporaries.
#define READUYVY                                \
  "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n"   \
  "zip1 v0.16b, v4.16b, v4.16b \n"              \
  "prfm pldl1keep, [%[src_uyvy], 448] \n"       \
  "tbl v1.16b, {v3.16b}, v2.16b \n"

// UB VR UG VG
// YG BB BG BR
#define YUVTORGB_SETUP                                                \
  "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n"       \
  "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"

// v16.8h: B
// v17.8h: G
// v18.8h: R

// Convert from YUV to 2.14 fixed point RGB
#define YUVTORGB                                          \
  "umull2 v3.4s, v0.8h, v24.8h \n"                        \
  "umull v6.8h, v1.8b, v30.8b \n"                         \
  "umull v0.4s, v0.4h, v24.4h \n"                         \
  "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */             \
  "uqshrn v0.4h, v0.4s, #16 \n"                           \
  "uqshrn2 v0.8h, v3.4s, #16 \n"     /* Y */              \
  "umull v4.8h, v1.8b, v28.8b \n"    /* DB */             \
  "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */             \
  "add v17.8h, v0.8h, v26.8h \n"     /* G */              \
  "add v16.8h, v0.8h, v4.8h \n"      /* B */              \
  "add v18.8h, v0.8h, v5.8h \n"      /* R */              \
  "uqsub v17.8h, v17.8h, v6.8h \n"   /* G */              \
  "uqsub v16.8h, v16.8h, v25.8h \n"  /* B */              \
  "uqsub v18.8h, v18.8h, v27.8h \n"  /* R */

// Convert from 2.14 fixed point RGB To 8 bit RGB
#define RGBTORGB8                     \
  "uqshrn v17.8b, v17.8h, #6 \n"      \
  "uqshrn v16.8b, v16.8h, #6 \n"      \
  "uqshrn v18.8b, v18.8h, #6 \n"

// Registers clobbered by YUVTORGB_SETUP, the READ* macros, YUVTORGB and
// RGBTORGB8 (v2 and the alpha register are listed by each caller).
#define YUVTORGB_REGS                                                          \
  "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
      "v26", "v27", "v28", "v29", "v30", "v31"

// Convert 8 pixels per iteration of I444 (separate Y, U, V) to ARGB with
// alpha set to 255.
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n"
      READYUV444 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}

// Convert 8 pixels per iteration of I422 (4 U and 4 V per 8 Y) to ARGB with
// alpha set to 255.
void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n" /* A */
      "1: \n"
      READYUV422 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}

// Convert 8 pixels per iteration of I444 plus an alpha plane to ARGB.
void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n"
      READYUV444
      "prfm pldl1keep, [%[src_a], 448] \n"
      YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}

// Convert 8 pixels per iteration of I422 plus an alpha plane to ARGB.
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      "ld1 {v19.8b}, [%[src_a]], #8 \n"
      READYUV422
      "prfm pldl1keep, [%[src_a], 448] \n"
      YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}

// Convert 8 pixels per iteration of I422 to RGBA: bytes are stored in
// A, B, G, R order with alpha set to 255.
void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v15.8b, #255 \n" /* A */
      "1: \n"
      READYUV422 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgba] "+r"(dst_rgba),                         // %[dst_rgba]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v15");
}

// Convert 8 pixels per iteration of I422 to RGB24 (3 bytes per pixel stored
// in B, G, R order).
void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      READYUV422 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}

// Pack 8 bit B (v16), G (v17), R (v18) into RGB565 in v18.8h.
#define ARGBTORGB565                                \
  "shll v18.8h, v18.8b, #8 \n"  /* R */             \
  "shll v17.8h, v17.8b, #8 \n"  /* G */             \
  "shll v16.8h, v16.8b, #8 \n"  /* B */             \
  "sri v18.8h, v17.8h, #5 \n"   /* RG */            \
  "sri v18.8h, v16.8h, #11 \n"  /* RGB */

// Convert 8 pixels per iteration of I422 to RGB565.
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1: \n"
      READYUV422 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      ARGBTORGB565
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}

// Pack 8 bit B (v16), G (v17), R (v18), A (v19) into ARGB1555 in v0.8h.
#define ARGBTOARGB1555                              \
  "shll v0.8h, v19.8b, #8 \n"   /* A */             \
  "shll v18.8h, v18.8b, #8 \n"  /* R */             \
  "shll v17.8h, v17.8b, #8 \n"  /* G */             \
  "shll v16.8h, v16.8b, #8 \n"  /* B */             \
  "sri v0.8h, v18.8h, #1 \n"    /* AR */            \
  "sri v0.8h, v17.8h, #6 \n"    /* ARG */           \
  "sri v0.8h, v16.8h, #11 \n"   /* ARGB */

// Convert 8 pixels per iteration of I422 to ARGB1555 with the alpha bit set.
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n"
      READYUV422 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      ARGBTOARGB1555
      "st1 {v0.8h}, [%[dst_argb1555]], #16 \n"  // store 8 pixels ARGB1555.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb1555] "+r"(dst_argb1555),                 // %[dst_argb1555]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}

#define ARGBTOARGB4444                                                       \
  /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */       \
  "ushr v16.8b, v16.8b, #4 \n"         /* B */                               \
  "bic v17.8b, v17.8b, v23.8b \n"      /* G */                               \
  "ushr v18.8b, v18.8b, #4 \n"         /* R */                               \
  "bic v19.8b, v19.8b, v23.8b \n"      /* A */                               \
  "orr v0.8b, v16.8b, v17.8b \n"       /* BG */                              \
  "orr v1.8b, v18.8b, v19.8b \n"       /* RA */                              \
  "zip1 v0.16b, v0.16b, v1.16b \n"     /* BGRA */

// Convert 8 pixels per iteration of I422 to ARGB4444 with alpha set to 15.
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v23.16b, #0x0f \n"  // bits to clear with vbic.
      "1: \n"
      READYUV422 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "movi v19.8b, #255 \n"  // reset alpha; ARGBTOARGB4444 modifies v19.
      ARGBTOARGB4444
      "st1 {v0.8h}, [%[dst_argb4444]], #16 \n"  // store 8 pixels ARGB4444.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb4444] "+r"(dst_argb4444),                 // %[dst_argb4444]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19", "v23");
}

// Convert 8 pixels per iteration of I400 (Y only, U and V taken as 128) to
// ARGB with alpha set to 255.
void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "1: \n"
      READYUV400 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "v19");
}

// Expand 8 Y values per iteration to ARGB by copying Y into B, G and R and
// setting alpha to 255. Two variants: ST4 store, or ZIP1/ZIP2 + STP.
#if LIBYUV_USE_ST4
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v23.8b, #255 \n"
      "1: \n"
      "ld1 {v20.8b}, [%0], #8 \n"
      "prfm pldl1keep, [%0, 448] \n"
      "orr v21.8b, v20.8b, v20.8b \n"
      "orr v22.8b, v20.8b, v20.8b \n"
      "subs %w2, %w2, #8 \n"
      "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v20", "v21", "v22", "v23");
}
#else
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v20.8b, #255 \n"
      "1: \n"
      "ldr d16, [%0], #8 \n"
      "subs %w2, %w2, #8 \n"
      "zip1 v18.16b, v16.16b, v16.16b \n"  // YY
      "zip1 v19.16b, v16.16b, v20.16b \n"  // YA
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v16.16b, v18.16b, v19.16b \n"  // YYYA
      "zip2 v17.16b, v18.16b, v19.16b \n"
      "stp q16, q17, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_y),     // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
}
#endif  // LIBYUV_USE_ST4

// Convert 8 pixels per iteration of NV12 (Y plane + interleaved UV) to ARGB
// with alpha set to 255.
void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READNV12 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}

// Convert 8 pixels per iteration of NV21 (Y plane + interleaved VU) to ARGB
// with alpha set to 255. Reuses READNV12 with kNV21Table bound to the
// kNV12Table operand name.
void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READNV12 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_vu),                              // %[src_uv]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}

// Convert 8 pixels per iteration of NV12 to RGB24.
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READNV12 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}

// Convert 8 pixels per iteration of NV21 to RGB24. Reuses READNV12 with
// kNV21Table bound to the kNV12Table operand name.
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READNV12 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_vu),                              // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV21Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}

// Convert 8 pixels per iteration of NV12 to RGB565.
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READNV12 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      ARGBTORGB565
      "st1 {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
      "b.gt 1b \n"
      : [src_y] "+r"(src_y),                                // %[src_y]
        [src_uv] "+r"(src_uv),                              // %[src_uv]
        [dst_rgb565] "+r"(dst_rgb565),                      // %[dst_rgb565]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2");
}

// Convert 8 pixels per iteration of packed YUY2 to ARGB with alpha set to
// 255.
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READYUY2 YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_yuy2] "+r"(src_yuy2),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}

// Convert 8 pixels per iteration of packed UYVY to ARGB with alpha set to
// 255.
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "movi v19.8b, #255 \n"
      "ldr q2, [%[kNV12Table]] \n"
      "1: \n"
      READUYVY YUVTORGB RGBTORGB8
      "subs %w[width], %w[width], #8 \n"
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
      "b.gt 1b \n"
      : [src_uyvy] "+r"(src_uyvy),                          // %[src_uyvy]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
        [kNV12Table] "r"(&kNV12Table)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
}

// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
void SplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v1.16b}, [%2], #16 \n" // store V "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List ); } #if LIBYUV_USE_ST2 // Reads 16 U's and V's and writes out 16 pairs of UV. void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List ); } void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width) { int shift = 16 - depth; asm volatile( "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" "prfm pldl1keep, [%1, 448] \n" "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : "r"(shift) // %4 : "cc", "memory", "v0", "v1", "v2"); } #else // Reads 16 U's and V's and writes out 16 pairs of UV. 
void MergeUVRow_NEON(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V "subs %w3, %w3, #16 \n" // 16 processed per loop "zip1 v2.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%0, 448] \n" "zip2 v3.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%1, 448] \n" "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } void MergeUVRow_16_NEON(const uint16_t* src_u, const uint16_t* src_v, uint16_t* dst_uv, int depth, int width) { int shift = 16 - depth; asm volatile( "dup v4.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V "ushl v0.8h, v0.8h, v4.8h \n" "ushl v1.8h, v1.8h, v4.8h \n" "prfm pldl1keep, [%0, 448] \n" "zip1 v2.8h, v0.8h, v1.8h \n" "zip2 v3.8h, v0.8h, v1.8h \n" "prfm pldl1keep, [%1, 448] \n" "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : "r"(shift) // %4 : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4"); } #endif // LIBYUV_USE_ST2 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. 
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  // ld3 de-interleaves 48 source bytes into three 16-byte vectors
  // (bytes 0,3,6,... -> v0; 1,4,7,... -> v1; 2,5,8,... -> v2), which are then
  // stored to dst_r, dst_g and dst_b respectively. The loop repeats while the
  // remaining count is > 0, so every pass handles a full 16 pixels.
  asm volatile(
      "1: \n"
      "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
      "subs %w4, %w4, #16 \n"                     // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint ahead of src_rgb
      "st1 {v0.16b}, [%1], #16 \n"    // store R
      "st1 {v1.16b}, [%2], #16 \n"    // store G
      "st1 {v2.16b}, [%3], #16 \n"    // store B
      "b.gt 1b \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
// The three planes are loaded into v0/v1/v2 and interleaved by the st3 store,
// producing 48 output bytes per pass.
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load R
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v2.16b}, [%2], #16 \n"  // load B
      "subs %w4, %w4, #16 \n"       // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
      "b.gt 1b \n"
      : "+r"(src_r),                      // %0
        "+r"(src_g),                      // %1
        "+r"(src_b),                      // %2
        "+r"(dst_rgb),                    // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
  );
}

// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
void SplitARGBRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  // ld4 de-interleaves 64 source bytes into v0..v3. Byte 0 of each pixel (v0)
  // is stored to dst_b, byte 1 to dst_g, byte 2 to dst_r and byte 3 to dst_a,
  // i.e. the in-memory byte order of a pixel is B, G, R, A.
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs %w5, %w5, #16 \n"         // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint ahead of src_rgba
      "st1 {v0.16b}, [%3], #16 \n"    // store B
      "st1 {v1.16b}, [%2], #16 \n"    // store G
      "st1 {v2.16b}, [%1], #16 \n"    // store R
      "st1 {v3.16b}, [%4], #16 \n"    // store A
      "b.gt 1b \n"
      : "+r"(src_rgba),                         // %0
        "+r"(dst_r),                            // %1
        "+r"(dst_g),                            // %2
        "+r"(dst_b),                            // %3
        "+r"(dst_a),                            // %4
        "+r"(width)                             // %5
      :                                         // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

#if LIBYUV_USE_ST4
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// This variant lets the st4 store do the interleaving.
void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%2], #16 \n"  // load B
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v2.16b}, [%0], #16 \n"  // load R
      "ld1 {v3.16b}, [%3], #16 \n"  // load A
      "subs %w5, %w5, #16 \n"       // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "prfm pldl1keep, [%3, 448] \n"
      "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n"  // store 16 ARGB
      "b.gt 1b \n"
      : "+r"(src_r),                            // %0
        "+r"(src_g),                            // %1
        "+r"(src_b),                            // %2
        "+r"(src_a),                            // %3
        "+r"(dst_argb),                         // %4
        "+r"(width)                             // %5
      :                                         // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
#else
// Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
// This variant interleaves in two zip stages (bytes, then 16-bit pairs) and
// stores the result with a plain st1.
void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%2], #16 \n"  // load B
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v2.16b}, [%0], #16 \n"  // load R
      "ld1 {v3.16b}, [%3], #16 \n"  // load A
      "subs %w5, %w5, #16 \n"       // 16 processed per loop
      "prfm pldl1keep, [%2, 448] \n"
      "zip1 v4.16b, v0.16b, v1.16b \n"  // BG, pixels 0..7
      "zip1 v5.16b, v2.16b, v3.16b \n"  // RA, pixels 0..7
      "prfm pldl1keep, [%1, 448] \n"
      "zip2 v6.16b, v0.16b, v1.16b \n"  // BG, pixels 8..15
      "zip2 v7.16b, v2.16b, v3.16b \n"  // RA, pixels 8..15
      "prfm pldl1keep, [%0, 448] \n"
      "zip1 v0.8h, v4.8h, v5.8h \n"  // BGRA, pixels 0..3
      "zip2 v1.8h, v4.8h, v5.8h \n"  // BGRA, pixels 4..7
      "prfm pldl1keep, [%3, 448] \n"
      "zip1 v2.8h, v6.8h, v7.8h \n"  // BGRA, pixels 8..11
      "zip2 v3.8h, v6.8h, v7.8h \n"  // BGRA, pixels 12..15
      "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n"  // store 16 ARGB
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      :                  // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
        "v7"  // Clobber List
  );
}
#endif  // LIBYUV_USE_ST4

// Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
// The fourth channel (v3) is loaded by ld4 but never stored.
void SplitXRGBRow_NEON(const uint8_t* src_rgba,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v0.16b}, [%3], #16 \n"  // store B
      "st1 {v1.16b}, [%2], #16 \n"  // store G
      "st1 {v2.16b}, [%1], #16 \n"  // store R
      "b.gt 1b \n"
      : "+r"(src_rgba),                         // %0
        "+r"(dst_r),                            // %1
        "+r"(dst_g),                            // %2
        "+r"(dst_b),                            // %3
        "+r"(width)                             // %4
      :                                         // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
// The alpha byte of every output pixel is the constant 255 held in v3, which
// is set once before the loop.
void MergeXRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "movi v3.16b, #255 \n"  // load A(255)
      "1: \n"
      "ld1 {v2.16b}, [%0], #16 \n"  // load R
      "ld1 {v1.16b}, [%1], #16 \n"  // load G
      "ld1 {v0.16b}, [%2], #16 \n"  // load B
      "subs %w4, %w4, #16 \n"       // 16 processed per loop
      "prfm pldl1keep, [%0, 448] \n"
      "prfm pldl1keep, [%1, 448] \n"
      "prfm pldl1keep, [%2, 448] \n"
      "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n"  // store 16 ARGB
      "b.gt 1b \n"
      : "+r"(src_r),                            // %0
        "+r"(src_g),                            // %1
        "+r"(src_b),                            // %2
        "+r"(dst_argb),                         // %3
        "+r"(width)                             // %4
      :                                         // Input registers
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Packs 4 pixels per pass from three 16-bit planes into 32-bit words laid out
// as B in bits 0..9, G in bits 10..19, R in bits 20..29 and the top two bits
// set. Each sample is shifted by (10 - depth) with ushl (a negative count
// shifts right) and then clamped to 1023 before packing.
void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = 10 - depth;
  asm volatile(
      "movi v30.16b, #255 \n"
      "ushr v30.4s, v30.4s, #22 \n"  // 1023
      "dup v31.4s, %w5 \n"           // broadcast the shift count
      "1: \n"
      "ldr d2, [%2], #8 \n"  // B
      "ldr d1, [%1], #8 \n"  // G
      "ldr d0, [%0], #8 \n"  // R
      "ushll v2.4s, v2.4h, #0 \n"  // B, widened to 32 bits
      "ushll v1.4s, v1.4h, #0 \n"  // G
      "ushll v0.4s, v0.4h, #0 \n"  // R
      "ushl v2.4s, v2.4s, v31.4s \n"  // 000B
      "ushl v1.4s, v1.4s, v31.4s \n"  // G
      "ushl v0.4s, v0.4s, v31.4s \n"  // R
      "umin v2.4s, v2.4s, v30.4s \n"  // clamp to 1023
      "umin v1.4s, v1.4s, v30.4s \n"
      "umin v0.4s, v0.4s, v30.4s \n"
      "sli v2.4s, v1.4s, #10 \n"       // 00GB
      "sli v2.4s, v0.4s, #20 \n"       // 0RGB
      "orr v2.4s, #0xc0, lsl #24 \n"   // ARGB (AR30)
      "subs %w4, %w4, #4 \n"           // 4 pixels per loop
      "str q2, [%3], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
}

// Same packing as MergeXR30Row_NEON but without the per-sample shift: samples
// are only clamped to 1023. The depth parameter is unnamed and unused.
void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width) {
  asm volatile(
      "movi v30.16b, #255 \n"
      "ushr v30.4s, v30.4s, #22 \n"  // 1023
      "1: \n"
      "ldr d2, [%2], #8 \n"  // B
      "ldr d1, [%1], #8 \n"  // G
      "ldr d0, [%0], #8 \n"  // R
      "ushll v2.4s, v2.4h, #0 \n"  // 000B
      "ushll v1.4s, v1.4h, #0 \n"  // G
      "ushll v0.4s, v0.4h, #0 \n"  // R
      "umin v2.4s, v2.4s, v30.4s \n"  // clamp to 1023
      "umin v1.4s, v1.4s, v30.4s \n"
      "umin v0.4s, v0.4s, v30.4s \n"
      "sli v2.4s, v1.4s, #10 \n"       // 00GB
      "sli v2.4s, v0.4s, #20 \n"       // 0RGB
      "orr v2.4s, #0xc0, lsl #24 \n"   // ARGB (AR30)
      "subs %w4, %w4, #4 \n"           // 4 pixels per loop
      "str q2, [%3], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "v0", "v1", "v2", "v30");
}
void MergeAR64Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint16_t* dst_ar64, int depth, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; asm volatile( "dup v30.8h, %w7 \n" "dup v31.8h, %w6 \n" "1: \n" "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B "ldr q3, [%3], #16 \n" // A "umin v2.8h, v2.8h, v30.8h \n" "prfm pldl1keep, [%0, 448] \n" "umin v1.8h, v1.8h, v30.8h \n" "prfm pldl1keep, [%1, 448] \n" "umin v0.8h, v0.8h, v30.8h \n" "prfm pldl1keep, [%2, 448] \n" "umin v3.8h, v3.8h, v30.8h \n" "prfm pldl1keep, [%3, 448] \n" "ushl v2.8h, v2.8h, v31.8h \n" "ushl v1.8h, v1.8h, v31.8h \n" "ushl v0.8h, v0.8h, v31.8h \n" "ushl v3.8h, v3.8h, v31.8h \n" "subs %w5, %w5, #8 \n" "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_ar64), // %4 "+r"(width) // %5 : "r"(shift), // %6 "r"(mask) // %7 : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); } void MergeXR64Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint16_t* dst_ar64, int depth, int width) { int shift = 16 - depth; int mask = (1 << depth) - 1; asm volatile( "movi v3.16b, #0xff \n" // A (0xffff) "dup v30.8h, %w6 \n" "dup v31.8h, %w5 \n" "1: \n" "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B "umin v2.8h, v2.8h, v30.8h \n" "prfm pldl1keep, [%0, 448] \n" "umin v1.8h, v1.8h, v30.8h \n" "prfm pldl1keep, [%1, 448] \n" "umin v0.8h, v0.8h, v30.8h \n" "prfm pldl1keep, [%2, 448] \n" "ushl v2.8h, v2.8h, v31.8h \n" "ushl v1.8h, v1.8h, v31.8h \n" "ushl v0.8h, v0.8h, v31.8h \n" "subs %w4, %w4, #8 \n" "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_ar64), // %3 "+r"(width) // %4 : "r"(shift), // %5 "r"(mask) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); } void 
MergeARGB16To8Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, const uint16_t* src_a, uint8_t* dst_argb, int depth, int width) { int shift = 8 - depth; asm volatile( "dup v31.8h, %w6 \n" "1: \n" "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B "ldr q3, [%3], #16 \n" // A "ushl v2.8h, v2.8h, v31.8h \n" "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v31.8h \n" "prfm pldl1keep, [%1, 448] \n" "ushl v0.8h, v0.8h, v31.8h \n" "prfm pldl1keep, [%2, 448] \n" "ushl v3.8h, v3.8h, v31.8h \n" "prfm pldl1keep, [%3, 448] \n" "uqxtn v2.8b, v2.8h \n" "uqxtn v1.8b, v1.8h \n" "uqxtn v0.8b, v0.8h \n" "uqxtn v3.8b, v3.8h \n" "subs %w5, %w5, #8 \n" "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(src_a), // %3 "+r"(dst_argb), // %4 "+r"(width) // %5 : "r"(shift) // %6 : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); } void MergeXRGB16To8Row_NEON(const uint16_t* src_r, const uint16_t* src_g, const uint16_t* src_b, uint8_t* dst_argb, int depth, int width) { int shift = 8 - depth; asm volatile( "dup v31.8h, %w5 \n" "movi v3.8b, #0xff \n" // A (0xff) "1: \n" "ldr q2, [%0], #16 \n" // R "ldr q1, [%1], #16 \n" // G "ldr q0, [%2], #16 \n" // B "ushl v2.8h, v2.8h, v31.8h \n" "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v31.8h \n" "prfm pldl1keep, [%1, 448] \n" "ushl v0.8h, v0.8h, v31.8h \n" "prfm pldl1keep, [%2, 448] \n" "uqxtn v2.8b, v2.8h \n" "uqxtn v1.8b, v1.8h \n" "uqxtn v0.8b, v0.8h \n" "subs %w4, %w4, #8 \n" "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 "+r"(src_b), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 : "r"(shift) // %5 : "memory", "cc", "v0", "v1", "v2", "v3", "v31"); } // Copy multiple of 32. 
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  // Moves 32 bytes per pass with a paired 128-bit load and store. The loop
  // repeats while the remaining count is > 0, so every pass copies a full
  // 32 bytes.
  asm volatile(
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch hint ahead of src
      "subs %w2, %w2, #32 \n"         // 32 processed per loop
      "stp q0, q1, [%1], #32 \n"
      "b.gt 1b \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2  // Output registers
      :                             // Input registers
      : "cc", "memory", "v0", "v1"  // Clobber List
  );
}

// SetRow writes 'width' bytes using an 8 bit value repeated.
// Stores are issued 16 bytes at a time, and the count is decremented before
// each store, so the row is written in whole 16-byte chunks.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  asm volatile(
      "dup v0.16b, %w2 \n"  // duplicate 16 bytes
      "1: \n"
      "subs %w1, %w1, #16 \n"  // 16 bytes per loop
      "st1 {v0.16b}, [%0], #16 \n"  // store
      "b.gt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc", "memory", "v0");
}

// Fills a row with a repeated 32-bit value; width is counted in 32-bit
// elements and 4 of them (16 bytes) are stored per pass.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  asm volatile(
      "dup v0.4s, %w2 \n"  // duplicate 4 ints
      "1: \n"
      "subs %w1, %w1, #4 \n"  // 4 ints per loop
      "st1 {v0.16b}, [%0], #16 \n"  // store
      "b.gt 1b \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc", "memory", "v0");
}

// Shuffle table for reversing the bytes.
// Used as the tbl index vector: output byte i is taken from input byte 15 - i.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Writes the source row to dst in reverse byte order, 32 bytes per pass.
// src is first advanced to (src + width - 32) and then walks backwards; the
// upper 16 bytes of each 32-byte group are reversed and stored first.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "ld1 {v3.16b}, [%3] \n"  // shuffler
      "add %0, %0, %w2, sxtw \n"
      "sub %0, %0, #32 \n"
      "1: \n"
      "ldr q2, [%0, 16] \n"    // upper 16 bytes of the group
      "ldr q1, [%0], -32 \n"   // src -= 32
      "subs %w2, %w2, #32 \n"  // 32 pixels per loop.
      "tbl v0.16b, {v2.16b}, v3.16b \n"  // reverse upper half
      "tbl v1.16b, {v1.16b}, v3.16b \n"  // reverse lower half
      "st1 {v0.16b, v1.16b}, [%1], #32 \n"  // store 32 pixels
      "b.gt 1b \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(&kShuffleMirror)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3");
}

// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u}; void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) { asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" "sub %0, %0, #32 \n" "1: \n" "ldr q1, [%0, 16] \n" "ldr q0, [%0], -32 \n" // src -= 32 "subs %w2, %w2, #16 \n" // 16 pixels per loop. "tbl v2.16b, {v1.16b}, v4.16b \n" "tbl v3.16b, {v0.16b}, v4.16b \n" "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(width) // %2 : "r"(&kShuffleMirrorUV) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w3, sxtw #1 \n" "sub %0, %0, #32 \n" "1: \n" "ldr q1, [%0, 16] \n" "ldr q0, [%0], -32 \n" // src -= 32 "subs %w3, %w3, #16 \n" // 16 pixels per loop. "tbl v2.16b, {v1.16b}, v4.16b \n" "tbl v3.16b, {v0.16b}, v4.16b \n" "uzp1 v0.16b, v2.16b, v3.16b \n" // U "uzp2 v1.16b, v2.16b, v3.16b \n" // V "st1 {v0.16b}, [%1], #16 \n" // dst += 16 "st1 {v1.16b}, [%2], #16 \n" "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 : "r"(&kShuffleMirrorUV) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } // Shuffle table for reversing the ARGB. static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u}; void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( // Start at end of source row. "ld1 {v4.16b}, [%3] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" "sub %0, %0, #32 \n" "1: \n" "ldr q1, [%0, 16] \n" "ldr q0, [%0], -32 \n" // src -= 32 "subs %w2, %w2, #8 \n" // 8 pixels per loop. 
"tbl v2.16b, {v1.16b}, v4.16b \n" "tbl v3.16b, {v0.16b}, v4.16b \n" "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32 "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(&kShuffleMirrorARGB) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { asm volatile( "ld1 {v3.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. "add %0, %0, %w2, sxtw \n" "sub %0, %0, #48 \n" "1: \n" "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48 "subs %w2, %w2, #16 \n" // 16 pixels per loop. "tbl v0.16b, {v0.16b}, v3.16b \n" "tbl v1.16b, {v1.16b}, v3.16b \n" "tbl v2.16b, {v2.16b}, v3.16b \n" "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : "r"((ptrdiff_t)-48), // %3 "r"(&kShuffleMirror) // %4 : "cc", "memory", "v0", "v1", "v2", "v3"); } void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( "movi v4.8b, #255 \n" // Alpha "1: \n" "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of // RGB24. "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { asm volatile( "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"orr v3.8b, v1.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { asm volatile( "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v2.8b, v4.8b, v4.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v5.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgba), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"orr v3.8b, v1.8b, v1.8b \n" // move g "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List ); } #define RGB565TOARGB \ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ "dup v2.2D, v0.D[1] \n" /* R */ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { asm volatile( "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List ); } #define ARGB1555TOARGB \ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ \ "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ "xtn2 v3.16b, v2.8h \n" \ \ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ \ "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ \ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ "dup v1.2D, v0.D[1] \n" \ "dup v3.2D, v2.D[1] \n" // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. #define RGB555TOARGB \ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ \ "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ \ "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ \ "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ "dup v1.2D, v0.D[1] \n" /* G */ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { asm volatile( "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
ARGB1555TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } // Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b // clobbers v3 #define ARGB4444TOARGB \ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ "dup v0.2D, v2.D[1] \n" \ "dup v1.2D, v3.D[1] \n" void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w2, %w2, #16 \n" // 16 pixels per loop. "prfm pldl1keep, [%0, 448] \n" "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48 \n" // store 8 RGB24 "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"orr v4.8b, v2.8b, v2.8b \n" // mov g "prfm pldl1keep, [%0, 448] \n" "orr v5.8b, v1.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. "subs %w2, %w2, #16 \n" // 16 processed per loop. "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. "subs %w2, %w2, #16 \n" // 16 processed per loop. "prfm pldl1keep, [%0, 448] \n" "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "prfm pldl1keep, [%0, 448] \n" "st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. "prfm pldl1keep, [%0, 448] \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v2.8b}, [%2], #8 \n" // store 8 V. 
"b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U "prfm pldl1keep, [%0, 448] \n" "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V "st1 {v1.8b}, [%2], #8 \n" // store 8 U. "st1 {v3.8b}, [%3], #8 \n" // store 8 V. "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, int stride_uyvy, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U "prfm pldl1keep, [%0, 448] \n" "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V "st1 {v0.8b}, [%2], #8 \n" // store 8 U. "st1 {v2.8b}, [%3], #8 \n" // store 8 V. "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { asm volatile( "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. 
"subs %w2, %w2, #4 \n" // 4 processed per loop "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "st1 {v1.16b}, [%1], #16 \n" // store 4. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys "subs %w4, %w4, #16 \n" // 16 pixels "orr v2.8b, v1.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_yuy2), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3"); } void I422ToUYVYRow_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys "orr v3.8b, v2.8b, v2.8b \n" "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %w4, %w4, #16 \n" // 16 pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 "+r"(dst_uyvy), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3"); } void ARGBToRGB565Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb565, int width) { asm volatile( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565. 
"b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 "+r"(width) // %2 : : "cc", "memory", "v16", "v17", "v18", "v19"); } void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, const uint32_t dither4, int width) { asm volatile( "dup v1.4s, %w2 \n" // dither4 "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // load 8 // pixels "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v16.8b, v16.8b, v1.8b \n" "prfm pldl1keep, [%0, 448] \n" "uqadd v17.8b, v17.8b, v1.8b \n" "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565 "st1 {v18.16b}, [%0], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : "+r"(dst_rgb) // %0 : "r"(src_argb), // %1 "r"(dither4), // %2 "r"(width) // %3 : "cc", "memory", "v1", "v16", "v17", "v18", "v19"); } void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb1555, int width) { asm volatile( "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v16", "v17", "v18", "v19"); } void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, uint8_t* dst_argb4444, int width) { asm volatile( "movi v23.16b, #0x0f \n" // bits to clear with // vbic. "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 // pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. 
// Remainder of the ARGBToARGB4444Row_NEON loop body (its head precedes this
// point): prefetch, pack via ARGBTOARGB4444, store 16 bytes, loop.
      "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
      "st1 {v0.16b}, [%1], #16 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
}

#if LIBYUV_USE_ST2
// Widens a row of 8-bit-per-channel pixels to 16 bits per channel (ST2 path).
// Each iteration loads 32 source bytes (8 pixels) into q0/q2, copies them to
// v1/v3, and lets st2 interleave each register with its copy, so every source
// byte is written twice in succession (64 bytes out per iteration).
// The loop runs in whole groups of 8 pixels while the count stays > 0.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(
      "1: \n"
      "ldp q0, q2, [%0], #32 \n"  // load 8 pixels
      "mov v1.16b, v0.16b \n"     // duplicate for the interleaving store
      "prfm pldl1keep, [%0, 448] \n"
      "mov v3.16b, v2.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st2 {v0.16b, v1.16b}, [%1], #32 \n"  // store 4 pixels
      "st2 {v2.16b, v3.16b}, [%1], #32 \n"  // store 4 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}

// tbl indices that swap bytes 0 and 2 of every 4-byte pixel and leave bytes
// 1 and 3 in place (four pixels per 16-byte register).
static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
                                         10, 9, 8, 11, 14, 13, 12, 15};

// Same widening as ARGBToAR64Row_NEON above, but each pixel's bytes 0 and 2
// are swapped first via tbl with kShuffleARGBToABGR (ST2 path).
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(
      "ldr q4, [%3] \n"  // shuffler
      "1: \n"
      "ldp q0, q2, [%0], #32 \n"  // load 8 pixels
      "tbl v0.16b, {v0.16b}, v4.16b \n"  // swap bytes 0 and 2 of each pixel
      "tbl v2.16b, {v2.16b}, v4.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "mov v1.16b, v0.16b \n"
      "mov v3.16b, v2.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "st2 {v0.16b, v1.16b}, [%1], #32 \n"  // store 4 pixels
      "st2 {v2.16b, v3.16b}, [%1], #32 \n"  // store 4 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),          // %0
        "+r"(dst_ab64),          // %1
        "+r"(width)              // %2
      : "r"(&kShuffleARGBToABGR)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
#else
// Widens a row of 8-bit-per-channel pixels to 16 bits per channel without
// st2: zip1/zip2 of a register with itself duplicates every byte.
// The remainder of this function's loop body follows this point.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"  // load 8 ARGB pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
"zip1 v2.16b, v0.16b, v0.16b \n" "zip2 v3.16b, v0.16b, v0.16b \n" "prfm pldl1keep, [%0, 448] \n" "zip1 v4.16b, v1.16b, v1.16b \n" "zip2 v5.16b, v1.16b, v1.16b \n" "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ar64), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); } static const uvec8 kShuffleARGBToAB64[2] = { {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7}, {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}}; void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( "ldp q6, q7, [%3] \n" // 2 shufflers "1: \n" "ldp q0, q1, [%0], #32 \n" // load 8 pixels "subs %w2, %w2, #8 \n" // 8 processed per loop. "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64 "tbl v3.16b, {v0.16b}, v7.16b \n" "prfm pldl1keep, [%0, 448] \n" "tbl v4.16b, {v1.16b}, v6.16b \n" "tbl v5.16b, {v1.16b}, v7.16b \n" "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64 "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_ab64), // %1 "+r"(width) // %2 : "r"(&kShuffleARGBToAB64[0]) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } #endif // LIBYUV_USE_ST2 static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31}; void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
// Tail of AR64ToARGBRow_NEON (its head precedes this point): store the two
// shuffled registers (32 bytes), loop, operand bindings and clobbers.
      "stp q0, q2, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_ar64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "r"(&kShuffleAR64ToARGB)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}

// tbl indices into a 32-byte (two-register) table: keeps the odd byte of
// every 16-bit channel and, within each 4-channel pixel, exchanges channels
// 0 and 2 (5,3,1,7 instead of 1,3,5,7).
static const uvec8 kShuffleAB64ToARGB = {5,  3,  1,  7,  13, 11, 9,  15,
                                         21, 19, 17, 23, 29, 27, 25, 31};

// Narrows a row of 16-bit-per-channel pixels to 8 bits per channel, swapping
// channels 0 and 2 of each pixel (see kShuffleAB64ToARGB).
// Consumes 64 source bytes and writes 32 destination bytes (8 pixels) per
// iteration; the loop runs in whole groups of 8 while the count stays > 0.
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "ldr q4, [%3] \n"  // shuffler
      "1: \n"
      "ldp q0, q1, [%0], #32 \n"  // load 4 pixels
      "ldp q2, q3, [%0], #32 \n"  // load 4 pixels
      "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "stp q0, q2, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_ab64),          // %0
        "+r"(dst_argb),          // %1
        "+r"(width)              // %2
      : "r"(&kShuffleAB64ToARGB)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}

// Computes one luma byte per ARGB pixel, 8 pixels per iteration:
//   Y = sat8(sat8(round((25 * B + 129 * G + 66 * R) / 256)) + 16)
// where B, G, R are the first three de-interleaved bytes of each pixel.
// The fourth lane loaded into v3 is immediately reused as the 16-bit
// accumulator, so the pixel's fourth byte does not contribute.
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #25 \n"   // B * 0.1016 coefficient
      "movi v5.8b, #129 \n"  // G * 0.5078 coefficient
      "movi v6.8b, #66 \n"   // R * 0.2578 coefficient
      "movi v7.8b, #16 \n"   // Add 16 constant
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v3.8h, v0.8b, v4.8b \n"  // B
      "prfm pldl1keep, [%0, 448] \n"
      "umlal v3.8h, v1.8b, v5.8b \n"  // G
      "umlal v3.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "uqadd v0.8b, v0.8b, v7.8b \n"  // + 16, saturating
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}

// Copies the fourth byte of every 4-byte pixel to dst_a, 16 pixels
// (64 source bytes) per iteration.
// The loop branch, operands and clobber list follow this point.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "st1 {v3.16b}, [%1], #16 \n"  // store 16 A's.
// Tail of ARGBExtractAlphaRow_NEON (its head precedes this point): loop
// branch, operand bindings and clobber list.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Computes one luma byte per ARGB pixel with no +16 offset, 8 pixels per
// iteration:
//   Y = sat8(round((29 * B + 150 * G + 77 * R) / 256))
// where B, G, R are the first three de-interleaved bytes of each pixel.
// v3 (fourth lane) is reused as the 16-bit accumulator.
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #29 \n"   // B * 0.1140 coefficient
      "movi v5.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v6.8b, #77 \n"   // R * 0.2990 coefficient
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v3.8h, v0.8b, v4.8b \n"  // B
      "prfm pldl1keep, [%0, 448] \n"
      "umlal v3.8h, v1.8b, v5.8b \n"  // G
      "umlal v3.8h, v2.8b, v6.8b \n"  // R
      "uqrshrn v0.8b, v3.8h, #8 \n"  // 16 bit to 8 bit Y
      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}

// Same luma formula as ARGBToYJRow_NEON, but the colour channels are taken
// from the second, third and fourth bytes of each pixel (v1 = B, v2 = G,
// v3 = R). The first lane (v0) is overwritten by the 16-bit accumulator, and
// the narrowed result is written to v3 before being stored.
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  asm volatile(
      "movi v4.8b, #29 \n"   // B * 0.1140 coefficient
      "movi v5.8b, #150 \n"  // G * 0.5870 coefficient
      "movi v6.8b, #77 \n"   // R * 0.2990 coefficient
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 RGBA
      "subs %w2, %w2, #8 \n"  // 8 processed per loop.
      "umull v0.8h, v1.8b, v4.8b \n"  // B
      "prfm pldl1keep, [%0, 448] \n"
      "umlal v0.8h, v2.8b, v5.8b \n"  // G
      "umlal v0.8h, v3.8b, v6.8b \n"  // R
      "uqrshrn v3.8b, v0.8h, #8 \n"  // 16 bit to 8 bit Y
      "st1 {v3.8b}, [%1], #8 \n"  // store 8 pixels Y.
      "b.gt 1b \n"
      : "+r"(src_rgba),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
}

// 8x1 pixels.
// Computes one U byte and one V byte for every ARGB pixel (no subsampling),
// 8 pixels per iteration. In 16-bit modular arithmetic:
//   U = (112 * B -  74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R -  94 * G - 18 * B + 0x8080) >> 8
// where B, G, R are the first three de-interleaved bytes of each pixel and
// the shift is a saturating narrow (uqshrn). The fourth lane (v3) is reused
// as the V accumulator, so the pixel's fourth byte does not contribute.
//
// src_argb: source row, 32 bytes consumed per iteration.
// dst_u, dst_v: destination rows, 8 bytes each written per iteration.
// width: pixel count; the loop runs in whole groups of 8 while the
//        decremented count stays > 0.
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "movi v24.8b, #112 \n"  // UB / VR 0.875 coefficient
      "movi v25.8b, #74 \n"   // UG -0.5781 coefficient
      "movi v26.8b, #38 \n"   // UR -0.2969 coefficient
      "movi v27.8b, #18 \n"   // VB -0.1406 coefficient
      "movi v28.8b, #94 \n"   // VG -0.7344 coefficient
      "movi v29.16b,#0x80 \n"  // 128.5 (every 16-bit lane reads as 0x8080)
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "umull v4.8h, v0.8b, v24.8b \n"  // B
      "prfm pldl1keep, [%0, 448] \n"
      "umlsl v4.8h, v1.8b, v25.8b \n"  // G
      "umlsl v4.8h, v2.8b, v26.8b \n"  // R
      "add v4.8h, v4.8h, v29.8h \n"  // +128 -> unsigned

      "umull v3.8h, v2.8b, v24.8b \n"  // R
      "umlsl v3.8h, v1.8b, v28.8b \n"  // G
      "umlsl v3.8h, v0.8b, v27.8b \n"  // B
      "add v3.8h, v3.8h, v29.8h \n"  // +128 -> unsigned

      "uqshrn v0.8b, v4.8h, #8 \n"  // 16 bit to 8 bit U
      "uqshrn v1.8b, v3.8h, #8 \n"  // 16 bit to 8 bit V

      "st1 {v0.8b}, [%1], #8 \n"  // store 8 pixels U.
      "st1 {v1.8b}, [%2], #8 \n"  // store 8 pixels V.
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
        "v27", "v28", "v29");
}

// Asm prologue shared by the subsampled *ToUVRow functions: loads the
// halved U/V coefficients into v20..v24 as 16-bit lanes and the 0x8080
// bias into v25. Any asm statement using it must list v20..v25 as clobbers.
#define RGBTOUV_SETUP_REG                                                  \
  "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */        \
  "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */         \
  "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */         \
  "movi v23.8h, #9, lsl #0 \n"  /* VB coefficient (-0.1406) / 2 */         \
  "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */         \
  "movi v25.16b, #0x80 \n"      /* 128.5 (0x8080 in 16-bit) */

// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// clang-format off #define RGBTOUV(QB, QG, QR) \ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ // clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. void ARGBToUVRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void ARGBToUVJRow_NEON(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void RAWToUVJRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void BGRAToUVRow_NEON(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v3.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void ABGRToUVRow_NEON(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. 
"uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v3.8h, #1 \n" // 2x average "urshr v2.8h, v2.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void RGBAToUVRow_NEON(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
"b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "urshr v0.8h, v0.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } void RAWToUVRow_NEON(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. 
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. "urshr v2.8h, v2.8h, #1 \n" // 2x average "urshr v1.8h, v1.8h, #1 \n" "urshr v0.8h, v0.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } // 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. RGB565TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. RGB565TOARGB "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
"ins v16.D[1], v26.D[0] \n" "ins v17.D[1], v27.D[0] \n" "ins v18.D[1], v28.D[0] \n" "urshr v0.8h, v16.8h, #1 \n" // 2x average "urshr v1.8h, v17.8h, #1 \n" "urshr v2.8h, v18.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile( RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. RGB555TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. RGB555TOARGB "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
"ins v16.D[1], v26.D[0] \n" "ins v17.D[1], v27.D[0] \n" "ins v18.D[1], v28.D[0] \n" "urshr v0.8h, v16.8h, #1 \n" // 2x average "urshr v1.8h, v17.8h, #1 \n" "urshr v2.8h, v18.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width) { const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile( RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. 
"ins v16.D[1], v26.D[0] \n" "ins v17.D[1], v27.D[0] \n" "ins v18.D[1], v28.D[0] \n" "urshr v0.8h, v16.8h, #1 \n" // 2x average "urshr v1.8h, v17.8h, #1 \n" "urshr v2.8h, v18.8h, #1 \n" "subs %w4, %w4, #16 \n" // 16 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 "+r"(width) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" ); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { asm volatile( "movi v24.8b, #25 \n" // B * 0.1016 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v27.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", "v27"); } void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. 
ARGB1555TOARGB "umull v3.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v7.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { asm volatile( "movi v24.8b, #25 \n" // B * 0.1016 coefficient "movi v25.8b, #129 \n" // G * 0.5078 coefficient "movi v26.8b, #66 \n" // R * 0.2578 coefficient "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v27.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #66 \n" // R * 0.2578 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v6.8b, #25 \n" // B * 0.1016 coefficient "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v7.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
"b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { asm volatile( "movi v6.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v4.8b, #66 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // R "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v7.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v7.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
"b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v6.8b, #66 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v7.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { asm volatile( "movi v6.8b, #25 \n" // B * 0.1016 coefficient "movi v5.8b, #129 \n" // G * 0.5078 coefficient "movi v4.8b, #66 \n" // R * 0.2578 coefficient "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y "uqadd v0.8b, v0.8b, v7.8b \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
"b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { asm volatile( "movi v4.8b, #29 \n" // B * 0.1140 coefficient "movi v5.8b, #150 \n" // G * 0.5870 coefficient "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { asm volatile( "movi v6.8b, #29 \n" // B * 0.1140 coefficient "movi v5.8b, #150 \n" // G * 0.5870 coefficient "movi v4.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
"b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_yj), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8_t* src_ptr1 = src_ptr + src_stride; asm volatile( "cmp %w4, #0 \n" "b.eq 100f \n" "cmp %w4, #128 \n" "b.eq 50f \n" "dup v5.16b, %w4 \n" "dup v4.16b, %w5 \n" // General purpose row blend. "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" "umull v2.8h, v0.8b, v4.8b \n" "prfm pldl1keep, [%1, 448] \n" "umull2 v3.8h, v0.16b, v4.16b \n" "prfm pldl1keep, [%2, 448] \n" "umlal v2.8h, v1.8b, v5.8b \n" "umlal2 v3.8h, v1.16b, v5.16b \n" "rshrn v0.8b, v2.8h, #8 \n" "rshrn2 v0.16b, v3.8h, #8 \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 1b \n" "b 99f \n" // Blend 50 / 50. "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" "prfm pldl1keep, [%1, 448] \n" "urhadd v0.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 50b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" "ld1 {v0.16b}, [%1], #16 \n" "subs %w3, %w3, #16 \n" "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_ptr1), // %2 "+r"(dst_width), // %3 "+r"(y1_fraction), // %4 "+r"(y0_fraction) // %5 : : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr void ARGBBlendRow_NEON(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { asm volatile( "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. 
"8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) "uqadd v0.8b, v0.8b, v4.8b \n" // + sb "uqadd v1.8b, v1.8b, v5.8b \n" // + sg "uqadd v2.8b, v2.8b, v6.8b \n" // + sr "movi v3.8b, #255 \n" // a = 255 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB // pixels "b.ge 8b \n" "89: \n" "adds %w3, %w3, #8-1 \n" "b.lt 99f \n" // Blend 1 pixels. "1: \n" "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel // ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel // ARGB1. "subs %w3, %w3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) "uqadd v0.8b, v0.8b, v4.8b \n" // + sb "uqadd v1.8b, v1.8b, v5.8b \n" // + sg "uqadd v2.8b, v2.8b, v6.8b \n" // + sr "movi v3.8b, #255 \n" // a = 255 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. 
"b.ge 1b \n" "99: \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18"); } // Attenuate 8 pixels at a time. void ARGBAttenuateRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( // Attenuate 8 pixels. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a "prfm pldl1keep, [%0, 448] \n" "umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v6.8h, v2.8b, v3.8b \n" // r * a "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; void ARGBQuantizeRow_NEON(uint8_t* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile( "dup v4.8h, %w2 \n" "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 "dup v5.8h, %w3 \n" // interval multiply. "dup v6.8h, %w4 \n" // interval add // 8 pixel loop. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. "subs %w1, %w1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) "prfm pldl1keep, [%0, 448] \n" "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale "sqdmulh v1.8h, v1.8h, v4.8h \n" // g "sqdmulh v2.8h, v2.8h, v4.8h \n" // r "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size "mul v1.8h, v1.8h, v5.8h \n" // g "mul v2.8h, v2.8h, v5.8h \n" // r "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset "add v1.8h, v1.8h, v6.8h \n" // g "add v2.8h, v2.8h, v6.8h \n" // r "uqxtn v0.8b, v0.8h \n" "uqxtn v1.8b, v1.8h \n" "uqxtn v2.8b, v2.8h \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(width) // %1 : "r"(scale), // %2 "r"(interval_size), // %3 "r"(interval_offset) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. void ARGBShadeRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width, uint32_t value) { asm volatile( "dup v0.4s, %w3 \n" // duplicate scale value. "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. "1: \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 
255) "prfm pldl1keep, [%0, 448] \n" "uxtl v5.8h, v5.8b \n" "uxtl v6.8h, v6.8b \n" "uxtl v7.8h, v7.8b \n" "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a "uqxtn v4.8b, v4.8h \n" "uqxtn v5.8b, v5.8h \n" "uqxtn v6.8b, v6.8h \n" "uqxtn v7.8b, v7.8h \n" "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"(value) // %3 : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (29 * b + 150 * g + 77 * r + 128) >> 8; void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( "movi v24.8b, #29 \n" // B * 0.1140 coefficient "movi v25.8b, #150 \n" // G * 0.5870 coefficient "movi v26.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B "orr v1.8b, v0.8b, v0.8b \n" // G "orr v2.8b, v0.8b, v0.8b \n" // R "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Operates in place on dst_argb, 8 pixels per iteration; results saturate to
// 8 bits and alpha is stored back unchanged.
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      "movi v20.8b, #17 \n"  // BB coefficient
      "movi v21.8b, #68 \n"  // BG coefficient
      "movi v22.8b, #35 \n"  // BR coefficient
      "movi v24.8b, #22 \n"  // GB coefficient
      "movi v25.8b, #88 \n"  // GG coefficient
      "movi v26.8b, #45 \n"  // GR coefficient
      "movi v28.8b, #24 \n"  // RB coefficient
      "movi v29.8b, #98 \n"  // RG coefficient
      "movi v30.8b, #50 \n"  // RR coefficient
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
      "subs %w1, %w1, #8 \n"  // 8 processed per loop.
      "umull v4.8h, v0.8b, v20.8b \n"  // B to Sepia B
      "prfm pldl1keep, [%0, 448] \n"
      "umlal v4.8h, v1.8b, v21.8b \n"  // G
      "umlal v4.8h, v2.8b, v22.8b \n"  // R
      "umull v5.8h, v0.8b, v24.8b \n"  // B to Sepia G
      "umlal v5.8h, v1.8b, v25.8b \n"  // G
      "umlal v5.8h, v2.8b, v26.8b \n"  // R
      "umull v6.8h, v0.8b, v28.8b \n"  // B to Sepia R
      "umlal v6.8h, v1.8b, v29.8b \n"  // G
      "umlal v6.8h, v2.8b, v30.8b \n"  // R
      "uqshrn v0.8b, v4.8h, #7 \n"     // 16 bit to 8 bit B
      "uqshrn v1.8b, v5.8h, #7 \n"     // 16 bit to 8 bit G
      "uqshrn v2.8b, v6.8h, #7 \n"     // 16 bit to 8 bit R
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
      "b.gt 1b \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
}

// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
// needs to saturate.  Consider doing a non-saturating version.
// matrix_argb supplies 16 signed 8-bit coefficients: four per output channel,
// in B, G, R, A output order, each group weighting the input B, G, R, A.
// The accumulated 16-bit sums are shifted right by 6 and saturated to 8 bits.
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width) {
  asm volatile(
      "ld1 {v2.16b}, [%3] \n"     // load 16 matrix coefficients (4 x 4).
      "sxtl v0.8h, v2.8b \n"      // B,G coefficients s16.
      "sxtl2 v1.8h, v2.16b \n"    // R,A coefficients s16.

      "1: \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
      "subs %w2, %w2, #8 \n"      // 8 processed per loop.
      "uxtl v16.8h, v16.8b \n"    // b (0 .. 255) 16 bit
      "prfm pldl1keep, [%0, 448] \n"
      "uxtl v17.8h, v17.8b \n"    // g
      "uxtl v18.8h, v18.8b \n"    // r
      "uxtl v19.8h, v19.8b \n"    // a
      "mul v22.8h, v16.8h, v0.h[0] \n"  // B = B * Matrix B
      "mul v23.8h, v16.8h, v0.h[4] \n"  // G = B * Matrix G
      "mul v24.8h, v16.8h, v1.h[0] \n"  // R = B * Matrix R
      "mul v25.8h, v16.8h, v1.h[4] \n"  // A = B * Matrix A
      "mul v4.8h, v17.8h, v0.h[1] \n"   // B += G * Matrix B
      "mul v5.8h, v17.8h, v0.h[5] \n"   // G += G * Matrix G
      "mul v6.8h, v17.8h, v1.h[1] \n"   // R += G * Matrix R
      "mul v7.8h, v17.8h, v1.h[5] \n"   // A += G * Matrix A
      "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
      "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
      "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
      "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
      "mul v4.8h, v18.8h, v0.h[2] \n"   // B += R * Matrix B
      "mul v5.8h, v18.8h, v0.h[6] \n"   // G += R * Matrix G
      "mul v6.8h, v18.8h, v1.h[2] \n"   // R += R * Matrix R
      "mul v7.8h, v18.8h, v1.h[6] \n"   // A += R * Matrix A
      "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
      "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
      "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
      "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
      "mul v4.8h, v19.8h, v0.h[3] \n"   // B += A * Matrix B
      "mul v5.8h, v19.8h, v0.h[7] \n"   // G += A * Matrix G
      "mul v6.8h, v19.8h, v1.h[3] \n"   // R += A * Matrix R
      "mul v7.8h, v19.8h, v1.h[7] \n"   // A += A * Matrix A
      "sqadd v22.8h, v22.8h, v4.8h \n"  // Accumulate B
      "sqadd v23.8h, v23.8h, v5.8h \n"  // Accumulate G
      "sqadd v24.8h, v24.8h, v6.8h \n"  // Accumulate R
      "sqadd v25.8h, v25.8h, v7.8h \n"  // Accumulate A
      "sqshrun v16.8b, v22.8h, #6 \n"   // 16 bit to 8 bit B
      "sqshrun v17.8b, v23.8h, #6 \n"   // 16 bit to 8 bit G
      "sqshrun v18.8b, v24.8h, #6 \n"   // 16 bit to 8 bit R
      "sqshrun v19.8b, v25.8h, #6 \n"   // 16 bit to 8 bit A
      "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      : "r"(matrix_argb)  // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
}

// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Every channel, alpha included, becomes (a * b + 128) >> 8.
void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "umull v0.8h, v0.8b, v4.8b \n"  // multiply B
      "prfm pldl1keep, [%0, 448] \n"
      "umull v1.8h, v1.8b, v5.8b \n"  // multiply G
      "prfm pldl1keep, [%1, 448] \n"
      "umull v2.8h, v2.8b, v6.8b \n"  // multiply R
      "umull v3.8h, v3.8b, v7.8b \n"  // multiply A
      "rshrn v0.8b, v0.8h, #8 \n"     // 16 bit to 8 bit B
      "rshrn v1.8b, v1.8h, #8 \n"     // 16 bit to 8 bit G
      "rshrn v2.8b, v2.8h, #8 \n"     // 16 bit to 8 bit R
      "rshrn v3.8b, v3.8h, #8 \n"     // 16 bit to 8 bit A
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}

// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Uses saturating byte adds on all four channels.
void ARGBAddRow_NEON(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqadd v0.8b, v0.8b, v4.8b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "uqadd v1.8b, v1.8b, v5.8b \n"
      "prfm pldl1keep, [%1, 448] \n"
      "uqadd v2.8b, v2.8b, v6.8b \n"
      "uqadd v3.8b, v3.8b, v7.8b \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}

// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Computes src_argb - src_argb1 with saturating byte subtracts on all four
// channels.
void ARGBSubtractRow_NEON(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uqsub v0.8b, v0.8b, v4.8b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "uqsub v1.8b, v1.8b, v5.8b \n"
      "prfm pldl1keep, [%1, 448] \n"
      "uqsub v2.8b, v2.8b, v6.8b \n"
      "uqsub v3.8b, v3.8b, v7.8b \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}

// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// The sum is a saturating byte add.
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "ld1 {v0.8b}, [%0], #8 \n"  // load 8 sobelx.
      "ld1 {v1.8b}, [%1], #8 \n"  // load 8 sobely.
      "subs %w3, %w3, #8 \n"      // 8 processed per loop.
      "uqadd v0.8b, v0.8b, v1.8b \n"  // add
      "prfm pldl1keep, [%0, 448] \n"
      "orr v1.8b, v0.8b, v0.8b \n"    // copy sum to G
      "prfm pldl1keep, [%1, 448] \n"
      "orr v2.8b, v0.8b, v0.8b \n"    // copy sum to R
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}

// Adds Sobel X and Sobel Y and stores Sobel into plane.
// Processes 16 pixels per iteration with a saturating byte add.
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      // 16 pixel loop.
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load 16 sobelx.
      "ld1 {v1.16b}, [%1], #16 \n"  // load 16 sobely.
      "subs %w3, %w3, #16 \n"       // 16 processed per loop.
      "prfm pldl1keep, [%0, 448] \n"
      "uqadd v0.16b, v0.16b, v1.16b \n"  // add
      "prfm pldl1keep, [%1, 448] \n"
      "st1 {v0.16b}, [%2], #16 \n"  // store 16 pixels.
      "b.gt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1");
}

// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "movi v3.8b, #255 \n"  // alpha
      // 8 pixel loop.
      "1: \n"
      "ld1 {v2.8b}, [%0], #8 \n"  // load 8 sobelx.
      "ld1 {v0.8b}, [%1], #8 \n"  // load 8 sobely.
      "subs %w3, %w3, #8 \n"      // 8 processed per loop.
      "prfm pldl1keep, [%0, 448] \n"
      "uqadd v1.8b, v0.8b, v2.8b \n"  // add
      "prfm pldl1keep, [%1, 448] \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
      "b.gt 1b \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3");
}

// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For each of the three rows the loop loads 8 bytes at offset 0 and 8 bytes
// at offset 2, then advances by 2 + 6 = 8 bytes. The output is the absolute
// value of the weighted sum of differences, saturated to 8 bits.
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.8b}, [%0],%5 \n"  // top
      "ld1 {v1.8b}, [%0],%6 \n"
      "usubl v0.8h, v0.8b, v1.8b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "ld1 {v2.8b}, [%1],%5 \n"  // center * 2
      "ld1 {v3.8b}, [%1],%6 \n"
      "usubl v1.8h, v2.8b, v3.8b \n"
      "prfm pldl1keep, [%1, 448] \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "ld1 {v2.8b}, [%2],%5 \n"  // bottom
      "ld1 {v3.8b}, [%2],%6 \n"
      "subs %w4, %w4, #8 \n"     // 8 pixels
      "prfm pldl1keep, [%2, 448] \n"
      "usubl v1.8h, v2.8b, v3.8b \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "abs v0.8h, v0.8h \n"
      "uqxtn v0.8b, v0.8h \n"
      "st1 {v0.8b}, [%3], #8 \n"  // store 8 sobelx
      "b.gt 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      : "r"(2LL),          // %5
        "r"(6LL)           // %6
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Both rows are read at offsets 0, 1 and 2 (advancing 1 + 1 + 6 = 8 bytes per
// iteration). The output is the absolute value of the weighted sum of
// (src_y0 - src_y1) differences, saturated to 8 bits.
void SobelYRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    uint8_t* dst_sobely,
                    int width) {
  asm volatile(
      "1: \n"
      "ld1 {v0.8b}, [%0],%4 \n"  // left
      "ld1 {v1.8b}, [%1],%4 \n"
      "usubl v0.8h, v0.8b, v1.8b \n"
      "ld1 {v2.8b}, [%0],%4 \n"  // center * 2
      "ld1 {v3.8b}, [%1],%4 \n"
      "usubl v1.8h, v2.8b, v3.8b \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "ld1 {v2.8b}, [%0],%5 \n"  // right
      "ld1 {v3.8b}, [%1],%5 \n"
      "subs %w3, %w3, #8 \n"     // 8 pixels
      "usubl v1.8h, v2.8b, v3.8b \n"
      "prfm pldl1keep, [%0, 448] \n"
      "add v0.8h, v0.8h, v1.8h \n"
      "prfm pldl1keep, [%1, 448] \n"
      "abs v0.8h, v0.8h \n"
      "uqxtn v0.8b, v0.8h \n"
      "st1 {v0.8b}, [%2], #8 \n"  // store 8 sobely
      "b.gt 1b \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(dst_sobely),  // %2
        "+r"(width)        // %3
      : "r"(1LL),          // %4
        "r"(6LL)           // %5
      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Caveat - rounds float to half float whereas scaling version truncates.
// Converts 8 uint16 samples per iteration to half floats with no scaling
// (the float parameter is unused): widen to 32 bit, convert to float, then
// narrow with fcvtn.
void HalfFloat1Row_NEON(const uint16_t* src,
                        uint16_t* dst,
                        float /*unused*/,
                        int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.16b}, [%0], #16 \n"  // load 8 shorts
      "subs %w2, %w2, #8 \n"        // 8 pixels per loop
      "uxtl v2.4s, v1.4h \n"        // 8 int's
      "prfm pldl1keep, [%0, 448] \n"
      "uxtl2 v3.4s, v1.8h \n"
      "scvtf v2.4s, v2.4s \n"  // 8 floats
      "scvtf v3.4s, v3.4s \n"
      "fcvtn v1.4h, v2.4s \n"  // 8 half floats
      "fcvtn2 v1.8h, v3.4s \n"
      "st1 {v1.16b}, [%1], #16 \n"  // store 8 shorts
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      :
      : "cc", "memory", "v1", "v2", "v3");
}

// Converts 8 uint16 samples per iteration to half floats scaled by scale.
// The scale is pre-multiplied by 1.9259299444e-34f (about 2^-112) so that,
// after the float multiply, shifting the float bit pattern right by 13 gives
// the half-float bit pattern. The shift truncates rather than rounds.
void HalfFloatRow_NEON(const uint16_t* src,
                       uint16_t* dst,
                       float scale,
                       int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.16b}, [%0], #16 \n"  // load 8 shorts
      "subs %w2, %w2, #8 \n"        // 8 pixels per loop
      "uxtl v2.4s, v1.4h \n"        // 8 int's
      "prfm pldl1keep, [%0, 448] \n"
      "uxtl2 v3.4s, v1.8h \n"
      "scvtf v2.4s, v2.4s \n"  // 8 floats
      "scvtf v3.4s, v3.4s \n"
      "fmul v2.4s, v2.4s, %3.s[0] \n"  // adjust exponent
      "fmul v3.4s, v3.4s, %3.s[0] \n"
      "uqshrn v1.4h, v2.4s, #13 \n"    // isolate halffloat
      "uqshrn2 v1.8h, v3.4s, #13 \n"
      "st1 {v1.16b}, [%1], #16 \n"  // store 8 shorts
      "b.gt 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst),                      // %1
        "+r"(width)                     // %2
      : "w"(scale * 1.9259299444e-34f)  // %3
      : "cc", "memory", "v1", "v2", "v3");
}

// Converts 8 bytes per iteration to floats multiplied by scale.
void ByteToFloatRow_NEON(const uint8_t* src,
                         float* dst,
                         float scale,
                         int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.8b}, [%0], #8 \n"  // load 8 bytes
      "subs %w2, %w2, #8 \n"      // 8 pixels per loop
      "uxtl v1.8h, v1.8b \n"      // 8 shorts
      "prfm pldl1keep, [%0, 448] \n"
      "uxtl v2.4s, v1.4h \n"  // 8 ints
      "uxtl2 v3.4s, v1.8h \n"
      "scvtf v2.4s, v2.4s \n"  // 8 floats
      "scvtf v3.4s, v3.4s \n"
      "fmul v2.4s, v2.4s, %3.s[0] \n"  // scale
      "fmul v3.4s, v3.4s, %3.s[0] \n"
      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // store 8 floats
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale)    // %3
      : "cc", "memory", "v1", "v2", "v3");
}

// Writes src * scale to dst, 8 floats per iteration, and returns the maximum
// of the unscaled source samples. The accumulators start at 0, so the result
// is never below 0.
float ScaleMaxSamples_NEON(const float* src,
                           float* dst,
                           float scale,
                           int width) {
  float fmax;
  asm volatile(
      "movi v5.4s, #0 \n"  // max
      "movi v6.4s, #0 \n"

      "1: \n"
      "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
      "subs %w2, %w2, #8 \n"              // 8 processed per loop
      "fmul v3.4s, v1.4s, %4.s[0] \n"     // scale
      "prfm pldl1keep, [%0, 448] \n"
      "fmul v4.4s, v2.4s, %4.s[0] \n"     // scale
      "fmax v5.4s, v5.4s, v1.4s \n"       // max
      "fmax v6.4s, v6.4s, v2.4s \n"
      "st1 {v3.4s, v4.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      "fmax v5.4s, v5.4s, v6.4s \n"  // max
      "fmaxv %s3, v5.4s \n"          // signed max accumulator
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width),  // %2
        "=w"(fmax)    // %3
      : "w"(scale)    // %4
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  return fmax;
}

// Writes src * scale to dst, 8 floats per iteration, and returns the sum of
// squares of the unscaled source samples.
float ScaleSumSamples_NEON(const float* src,
                           float* dst,
                           float scale,
                           int width) {
  float fsum;
  asm volatile(
      "movi v5.4s, #0 \n"  // sum accumulator
      "movi v6.4s, #0 \n"  // sum accumulator

      "1: \n"
      "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
      "subs %w2, %w2, #8 \n"              // 8 processed per loop
      "fmul v3.4s, v1.4s, %4.s[0] \n"     // scale
      "prfm pldl1keep, [%0, 448] \n"
      "fmul v4.4s, v2.4s, %4.s[0] \n"
      "fmla v5.4s, v1.4s, v1.4s \n"  // sum of squares
      "fmla v6.4s, v2.4s, v2.4s \n"
      "st1 {v3.4s, v4.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      "faddp v5.4s, v5.4s, v6.4s \n"
      "faddp v5.4s, v5.4s, v5.4s \n"
      "faddp %3.4s, v5.4s, v5.4s \n"  // sum
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width),  // %2
        "=w"(fsum)    // %3
      : "w"(scale)    // %4
      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
  return fsum;
}

// Writes src * scale to dst, 8 floats per iteration.
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
  asm volatile(
      "1: \n"
      "ld1 {v1.4s, v2.4s}, [%0], #32 \n"  // load 8 samples
      "prfm pldl1keep, [%0, 448] \n"
      "subs %w2, %w2, #8 \n"              // 8 processed per loop
      "fmul v1.4s, v1.4s, %3.s[0] \n"     // scale
      "fmul v2.4s, v2.4s, %3.s[0] \n"     // scale
      "st1 {v1.4s, v2.4s}, [%1], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "w"(scale)    // %3
      : "cc", "memory", "v1", "v2");
}

// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// dst = src0 + 4 * src1 + 6 * src2 + 4 * src3 + src4, with the 16-bit inputs
// widened to 32-bit sums; 8 samples per iteration.
void GaussCol_NEON(const uint16_t* src0,
                   const uint16_t* src1,
                   const uint16_t* src2,
                   const uint16_t* src3,
                   const uint16_t* src4,
                   uint32_t* dst,
                   int width) {
  asm volatile(
      "movi v6.8h, #4 \n"  // constant 4
      "movi v7.8h, #6 \n"  // constant 6

      "1: \n"
      "ld1 {v1.8h}, [%0], #16 \n"  // load 8 samples, 5 rows
      "ld1 {v2.8h}, [%4], #16 \n"
      "uaddl v0.4s, v1.4h, v2.4h \n"  // * 1
      "prfm pldl1keep, [%0, 448] \n"
      "uaddl2 v1.4s, v1.8h, v2.8h \n"  // * 1
      "ld1 {v2.8h}, [%1], #16 \n"
      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
      "prfm pldl1keep, [%1, 448] \n"
      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
      "ld1 {v2.8h}, [%2], #16 \n"
      "umlal v0.4s, v2.4h, v7.4h \n"  // * 6
      "prfm pldl1keep, [%2, 448] \n"
      "umlal2 v1.4s, v2.8h, v7.8h \n"  // * 6
      "ld1 {v2.8h}, [%3], #16 \n"
      "umlal v0.4s, v2.4h, v6.4h \n"  // * 4
      "prfm pldl1keep, [%3, 448] \n"
      "umlal2 v1.4s, v2.8h, v6.8h \n"  // * 4
      "subs %w6, %w6, #8 \n"           // 8 processed per loop
      "st1 {v0.4s,v1.4s}, [%5], #32 \n"  // store 8 samples
      "prfm pldl1keep, [%4, 448] \n"
      "b.gt 1b \n"
      : "+r"(src0),  // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(src4),  // %4
        "+r"(dst),   // %5
        "+r"(width)  // %6
      :
      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
}

// filter 5 adjacent samples of a row with 1, 4, 6, 4, 1 coefficients.
void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { const uint32_t* src1 = src + 1; const uint32_t* src2 = src + 2; const uint32_t* src3 = src + 3; asm volatile( "movi v6.4s, #4 \n" // constant 4 "movi v7.4s, #6 \n" // constant 6 "1: \n" "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples "add v0.4s, v0.4s, v1.4s \n" // * 1 "add v1.4s, v1.4s, v2.4s \n" // * 1 "ld1 {v2.4s,v3.4s}, [%2], #32 \n" "mla v0.4s, v2.4s, v7.4s \n" // * 6 "mla v1.4s, v3.4s, v7.4s \n" // * 6 "ld1 {v2.4s,v3.4s}, [%1], #32 \n" "ld1 {v4.4s,v5.4s}, [%3], #32 \n" "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 "add v3.4s, v3.4s, v5.4s \n" "prfm pldl1keep, [%0, 448] \n" "mla v0.4s, v2.4s, v6.4s \n" // * 4 "mla v1.4s, v3.4s, v6.4s \n" // * 4 "subs %w5, %w5, #8 \n" // 8 processed per loop "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack "uqrshrn2 v0.8h, v1.4s, #8 \n" "st1 {v0.8h}, [%4], #16 \n" // store 8 samples "b.gt 1b \n" : "+r"(src), // %0 "+r"(src1), // %1 "+r"(src2), // %2 "+r"(src3), // %3 "+r"(dst), // %4 "+r"(width) // %5 : "r"(32LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f}; // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. 
void GaussCol_F32_NEON(const float* src0,
                       const float* src1,
                       const float* src2,
                       const float* src3,
                       const float* src4,
                       float* dst,
                       int width) {
  // dst = src0 + 4 * src1 + 6 * src2 + 4 * src3 + src4, 8 floats per
  // iteration. No normalization is applied in this pass.
  asm volatile(
      // ld2r replicates the first two floats at %7 across v6 and v7.
      "ld2r {v6.4s, v7.4s}, [%7] \n"  // constants 4 and 6

      "1: \n"
      "ld1 {v0.4s, v1.4s}, [%0], #32 \n"  // load 8 samples, 5 rows
      "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
      "fmla v0.4s, v2.4s, v6.4s \n"  // * 4
      "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
      "fmla v1.4s, v3.4s, v6.4s \n"
      "prfm pldl1keep, [%0, 448] \n"
      "fmla v0.4s, v4.4s, v7.4s \n"  // * 6
      "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
      "fmla v1.4s, v5.4s, v7.4s \n"
      "prfm pldl1keep, [%1, 448] \n"
      "fmla v0.4s, v2.4s, v6.4s \n"  // * 4
      "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
      "fmla v1.4s, v3.4s, v6.4s \n"
      "prfm pldl1keep, [%2, 448] \n"
      "fadd v0.4s, v0.4s, v4.4s \n"  // * 1
      "prfm pldl1keep, [%3, 448] \n"
      "fadd v1.4s, v1.4s, v5.4s \n"
      "prfm pldl1keep, [%4, 448] \n"
      "subs %w6, %w6, #8 \n"  // 8 processed per loop
      "st1 {v0.4s, v1.4s}, [%5], #32 \n"  // store 8 samples
      "b.gt 1b \n"
      : "+r"(src0),               // %0
        "+r"(src1),               // %1
        "+r"(src2),               // %2
        "+r"(src3),               // %3
        "+r"(src4),               // %4
        "+r"(dst),                // %5
        "+r"(width)               // %6
      : "r"(&kGaussCoefficients)  // %7
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}

// filter 5 adjacent samples of a row with 1, 4, 6, 4, 1 coefficients.
void GaussRow_F32_NEON(const float* src, float* dst, int width) { asm volatile( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 "1: \n" "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5 // rows "fadd v0.4s, v0.4s, v1.4s \n" // * 1 "ld1 {v4.4s, v5.4s}, [%0], %5 \n" "fadd v1.4s, v1.4s, v2.4s \n" "fmla v0.4s, v4.4s, v7.4s \n" // * 6 "ld1 {v2.4s, v3.4s}, [%0], %4 \n" "fmla v1.4s, v5.4s, v7.4s \n" "ld1 {v4.4s, v5.4s}, [%0], %6 \n" "fadd v2.4s, v2.4s, v4.4s \n" "fadd v3.4s, v3.4s, v5.4s \n" "fmla v0.4s, v2.4s, v6.4s \n" // * 4 "fmla v1.4s, v3.4s, v6.4s \n" "prfm pldl1keep, [%0, 448] \n" "fmul v0.4s, v0.4s, v8.4s \n" // / 256 "fmul v1.4s, v1.4s, v8.4s \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 : "r"(&kGaussCoefficients), // %3 "r"(8LL), // %4 "r"(-4LL), // %5 "r"(20LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); } #if LIBYUV_USE_ST3 // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { asm volatile( "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values "prfm pldl1keep, [%0, 448] \n" "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2"); } #else static const uvec8 kYUV24Shuffle[3] = { {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20}, {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27}, {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}}; // Convert biplanar NV21 to packed YUV24 // 
NV21 has VU in memory for chroma. // YUV24 is VUY in memory void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { asm volatile( "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24 "prfm pldl1keep, [%0, 448] \n" "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n" "prfm pldl1keep, [%1, 448] \n" "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n" "subs %w3, %w3, #16 \n" // 16 pixels per loop "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_vu), // %1 "+r"(dst_yuv24), // %2 "+r"(width) // %3 : "r"(&kYUV24Shuffle[0]) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } #endif // LIBYUV_USE_ST3 // Note ST2 8b version is faster than zip+ST1 // AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. 
"b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_uv), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } void AYUVToVURow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_vu, int width) { const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(src_ayuv_1), // %1 "+r"(dst_vu), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Copy row of AYUV Y's into Y void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 "subs %w2, %w2, #16 \n" // 16 pixels per loop "prfm pldl1keep, [%0, 448] \n" "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels "b.gt 1b \n" : "+r"(src_ayuv), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3"); } // Shuffle table for swapping UV bytes. static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; // Convert UV plane of NV12 to VU of NV21. 
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { asm volatile( "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop "tbl v0.16b, {v0.16b}, v2.16b \n" "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v1.16b}, v2.16b \n" "stp q0, q1, [%1], 32 \n" // store 16 VU pixels "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 : "r"(&kShuffleSwapUV) // %3 : "cc", "memory", "v0", "v1", "v2"); } void HalfMergeUVRow_NEON(const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint8_t* dst_uv, int width) { const uint8_t* src_u_1 = src_u + src_stride_u; const uint8_t* src_v_1 = src_v + src_stride_v; asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values "ld1 {v2.16b}, [%1], #16 \n" "ld1 {v3.16b}, [%3], #16 \n" "uaddlp v0.8h, v0.16b \n" // half size "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" "prfm pldl1keep, [%2, 448] \n" "uadalp v0.8h, v2.16b \n" "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v3.16b \n" "prfm pldl1keep, [%3, 448] \n" "uqrshrn v0.8b, v0.8h, #2 \n" "uqrshrn v1.8b, v1.8h, #2 \n" "subs %w5, %w5, #16 \n" // 16 src pixels per loop "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_u_1), // %1 "+r"(src_v), // %2 "+r"(src_v_1), // %3 "+r"(dst_uv), // %4 "+r"(width) // %5 : : "cc", "memory", "v0", "v1", "v2", "v3"); } void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, int depth, int width) { int shift = depth - 16; // Negative for right shift. 
asm volatile( "dup v2.8h, %w4 \n" "1: \n" "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 "+r"(width) // %3 : "r"(shift) // %4 : "cc", "memory", "v0", "v1", "v2"); } void MultiplyRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { asm volatile( "dup v2.8h, %w2 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" "mul v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1] \n" // store 16 pixels "add %1, %1, #32 \n" "subs %w3, %w3, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(scale), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2"); } void DivideRow_16_NEON(const uint16_t* src_y, uint16_t* dst_y, int scale, int width) { asm volatile( "dup v0.8h, %w2 \n" "1: \n" "ldp q1, q2, [%0], #32 \n" "ushll v3.4s, v1.4h, #0 \n" "ushll v4.4s, v2.4h, #0 \n" "prfm pldl1keep, [%0, 448] \n" "ushll2 v1.4s, v1.8h, #0 \n" "ushll2 v2.4s, v2.8h, #0 \n" "mul v3.4s, v0.4s, v3.4s \n" "mul v4.4s, v0.4s, v4.4s \n" "mul v1.4s, v0.4s, v1.4s \n" "mul v2.4s, v0.4s, v2.4s \n" "shrn v3.4h, v3.4s, #16 \n" "shrn v4.4h, v4.4s, #16 \n" "shrn2 v3.8h, v1.4s, #16 \n" "shrn2 v4.8h, v2.4s, #16 \n" "stp q3, q3, [%1] \n" // store 16 pixels "add %1, %1, #32 \n" "subs %w3, %w3, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(scale), // %2 "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif 
libyuv-0.0~git20220104.b91df1a/source/row_win.cc000066400000000000000000006413741416500237200210400ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" // This module is for Visual C 32/64 bit #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) #if defined(_M_X64) #include #include // For _mm_maddubs_epi16 #endif #ifdef __cplusplus namespace libyuv { extern "C" { #endif // 64 bit #if defined(_M_X64) // Read 8 UV from 444 #define READYUV444 \ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ u_buf += 8; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ y_buf += 8; // Read 8 UV from 444, With 8 Alpha. #define READYUVA444 \ xmm3 = _mm_loadl_epi64((__m128i*)u_buf); \ xmm1 = _mm_loadl_epi64((__m128i*)(u_buf + offset)); \ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ u_buf += 8; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ y_buf += 8; \ xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ a_buf += 8; // Read 4 UV from 422, upsample to 8 UV. #define READYUV422 \ xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ xmm3 = _mm_unpacklo_epi16(xmm3, xmm3); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. 
#define READYUVA422                                        \
  /* 8 Y bytes, each duplicated into a 16-bit lane. */     \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                 \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                    \
  y_buf += 8;                                              \
  /* 4 U and 4 V bytes -> UV pairs, each pair doubled. */  \
  xmm3 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);             \
  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset));  \
  u_buf += 4;                                              \
  xmm3 = _mm_unpacklo_epi8(xmm3, xmm1);                    \
  xmm3 = _mm_unpacklo_epi16(xmm3, xmm3);                   \
  /* 8 alpha bytes, left packed in the low half. */        \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                 \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
// Expects interleaved UV bytes in xmm3 and 16-bit Y in xmm4; leaves 8 packed
// bytes of B, G and R in the low halves of xmm0, xmm1 and xmm2.
#define YUVTORGB(yuvconstants)                                      \
  /* Re-centre UV around zero; scale and bias Y. */                 \
  xmm3 = _mm_sub_epi8(xmm3, _mm_set1_epi8(0x80));                   \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);   \
  xmm4 = _mm_add_epi16(xmm4, *(__m128i*)yuvconstants->kYBiasToRgb); \
  /* B = Y + UV contribution. */                                    \
  xmm0 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToB, xmm3);  \
  xmm0 = _mm_adds_epi16(xmm4, xmm0);                                \
  xmm0 = _mm_srai_epi16(xmm0, 6);                                   \
  xmm0 = _mm_packus_epi16(xmm0, xmm0);                              \
  /* G = Y - UV contribution. */                                    \
  xmm1 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToG, xmm3);  \
  xmm1 = _mm_subs_epi16(xmm4, xmm1);                                \
  xmm1 = _mm_srai_epi16(xmm1, 6);                                   \
  xmm1 = _mm_packus_epi16(xmm1, xmm1);                              \
  /* R = Y + UV contribution. */                                    \
  xmm2 = _mm_maddubs_epi16(*(__m128i*)yuvconstants->kUVToR, xmm3);  \
  xmm2 = _mm_adds_epi16(xmm4, xmm2);                                \
  xmm2 = _mm_srai_epi16(xmm2, 6);                                   \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);

// Store 8 ARGB values.
#define STOREARGB \ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ xmm1 = _mm_loadu_si128(&xmm0); \ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ dst_argb += 32; #if defined(HAS_I422TOARGBROW_SSSE3) void I422ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUV422 YUVTORGB(yuvconstants) STOREARGB width -= 8; } } #endif #if defined(HAS_I422ALPHATOARGBROW_SSSE3) void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 YUVTORGB(yuvconstants) STOREARGB width -= 8; } } #endif #if defined(HAS_I444TOARGBROW_SSSE3) void I444ToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUV444 YUVTORGB(yuvconstants) STOREARGB width -= 8; } } #endif #if defined(HAS_I444ALPHATOARGBROW_SSSE3) void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5; const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA444 YUVTORGB(yuvconstants) 
STOREARGB width -= 8; } } #endif // 32 bit #else // defined(_M_X64) #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0}; static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0}; static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0}; static const vec8 kARGBToV = { -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0}; // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; // Constants for BGRA. static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13}; static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112}; static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR. static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0}; static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0}; static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. 
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33}; static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38}; static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112}; static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; // 8 bit fixed point 0.5, for bias of UV. static const ulvec8 kBiasUV128 = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. 
static const uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 static const uvec8 kShuffleMaskARGBToRGB24_0 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; // Duplicates gray value 3 times and fills in alpha opaque. __declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_y mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 convertloop: movq xmm0, qword ptr [eax] lea eax, [eax + 8] punpcklbw xmm0, xmm0 movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 punpckhwd xmm1, xmm1 por xmm0, xmm5 por xmm1, xmm5 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop ret } } #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. 
__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_y mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: vmovdqu xmm0, [eax] lea eax, [eax + 16] vpermq ymm0, ymm0, 0xd8 vpunpcklbw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 vpunpckhwd ymm1, ymm0, ymm0 vpunpcklwd ymm0, ymm0, ymm0 vpor ymm0, ymm0, ymm5 vpor ymm1, ymm1, ymm5 vmovdqu [edx], ymm0 vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_J400TOARGBROW_AVX2 __declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_rgb24 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 movdqu [edx + 48], xmm3 lea edx, [edx + 64] sub ecx, 16 jg convertloop ret } } __declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_raw mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb 
xmm2, xmm4 por xmm2, xmm5 palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 movdqu [edx + 48], xmm3 lea edx, [edx + 64] sub ecx, 16 jg convertloop ret } } __declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { __asm { mov eax, [esp + 4] // src_raw mov edx, [esp + 8] // dst_rgb24 mov ecx, [esp + 12] // width movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 movdqa xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 4] movdqu xmm2, [eax + 8] lea eax, [eax + 24] pshufb xmm0, xmm3 pshufb xmm1, xmm4 pshufb xmm2, xmm5 movq qword ptr [edx], xmm0 movq qword ptr [edx + 8], xmm1 movq qword ptr [edx + 16], xmm2 lea edx, [edx + 24] sub ecx, 8 jg convertloop ret } } // pmul method to replicate bits. // Math to replicate bits: // (v << 8) | (v << 3) // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. 
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax pshufd xmm5, xmm5, 0 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits movd xmm6, eax pshufd xmm6, xmm6, 0 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green psllw xmm4, 10 psrlw xmm4, 5 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 mov eax, [esp + 4] // src_rgb565 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 movdqa xmm2, xmm0 pand xmm1, xmm3 // R in upper 5 bits psllw xmm2, 11 // B in upper 5 bits pmulhuw xmm1, xmm5 // * (256 + 8) pmulhuw xmm2, xmm5 // * (256 + 8) psllw xmm1, 8 por xmm1, xmm2 // RB pand xmm0, xmm4 // G in middle 6 bits pmulhuw xmm0, xmm6 // << 5 * (256 + 4) por xmm0, xmm7 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop ret } } #ifdef HAS_RGB565TOARGBROW_AVX2 // pmul method to replicate bits. 
// Math to replicate bits: // (v << 8) | (v << 3) // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 __declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green vpsllw ymm4, ymm4, 10 vpsrlw ymm4, ymm4, 5 vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 mov eax, [esp + 4] // src_rgb565 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 vpand ymm1, ymm0, ymm3 // R in upper 5 bits vpsllw ymm2, ymm0, 11 // B in upper 5 bits vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 vpor ymm1, ymm1, ymm2 // RB vpand ymm0, ymm0, ymm4 // G in middle 6 bits vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) vpor ymm0, ymm0, ymm7 // AG vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB lea eax, [eax + 32] sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 __declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax vbroadcastss ymm5, xmm5 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 vpcmpeqb ymm3, ymm3, ymm3 // 
generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 mov eax, [esp + 4] // src_argb1555 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 vpsllw ymm1, ymm0, 1 // R in upper 5 bits vpsllw ymm2, ymm0, 11 // B in upper 5 bits vpand ymm1, ymm1, ymm3 vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 vpor ymm1, ymm1, ymm2 // RB vpsraw ymm2, ymm0, 8 // A vpand ymm0, ymm0, ymm4 // G in middle 5 bits vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) vpand ymm2, ymm2, ymm7 vpor ymm0, ymm0, ymm2 // AG vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB lea eax, [eax + 32] sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 __declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f vmovd xmm4, eax vbroadcastss ymm4, xmm4 vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles mov eax, [esp + 4] // src_argb4444 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 vpand ymm2, ymm0, ymm5 // mask high nibbles vpand ymm0, ymm0, ymm4 // mask low nibbles vpsrlw ymm3, ymm2, 4 vpsllw ymm1, ymm0, 4 vpor ymm2, ymm2, ymm3 vpor ymm0, ymm0, ymm1 vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm2, ymm2, 0xd8 vpunpckhbw ymm1, ymm0, ymm2 vpunpcklbw ymm0, ymm0, ymm2 vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB vmovdqu [eax * 2 + edx + 32], ymm1 
// store next 8 pixels of ARGB lea eax, [eax + 32] sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions __declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax pshufd xmm5, xmm5, 0 mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits movd xmm6, eax pshufd xmm6, xmm6, 0 pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green psrlw xmm4, 6 pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 mov eax, [esp + 4] // src_argb1555 mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 movdqa xmm2, xmm0 psllw xmm1, 1 // R in upper 5 bits psllw xmm2, 11 // B in upper 5 bits pand xmm1, xmm3 pmulhuw xmm2, xmm5 // * (256 + 8) pmulhuw xmm1, xmm5 // * (256 + 8) psllw xmm1, 8 por xmm1, xmm2 // RB movdqa xmm2, xmm0 pand xmm0, xmm4 // G in middle 5 bits psraw xmm2, 8 // A pmulhuw xmm0, xmm6 // << 6 * (256 + 8) pand xmm2, xmm7 por xmm0, xmm2 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 movdqu [eax * 2 + edx], xmm1 // store 4 pixels of ARGB movdqu [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB lea eax, [eax + 16] sub ecx, 8 jg convertloop ret } } // 18 instructions. 
// Expands 16-bit ARGB4444 pixels to 32-bit ARGB by replicating each nibble
// into both halves of a byte. Arguments are read from the stack. Each loop
// iteration handles 8 pixels (16 source bytes -> 32 destination bytes); the
// loop repeats while the remaining width is > 0, so a width that is not a
// multiple of 8 still runs a full final iteration.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]  // src_argb4444
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    // edx = dst - 2 * src, so [eax * 2 + edx] addresses the destination and
    // only eax has to be advanced inside the loop.
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4  // mask low nibbles
    pand xmm2, xmm5  // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// Packs 32-bit ARGB pixels into 24-bit pixels using the byte shuffle
// kShuffleMaskARGBToRGB24. Each loop iteration handles 16 pixels (64 source
// bytes -> 48 destination bytes); a width that is not a multiple of 16 still
// runs a full final iteration.
__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Same packing sequence as ARGBToRGB24Row_SSSE3, but shuffles with
// kShuffleMaskARGBToRAW. Each loop iteration handles 16 pixels (64 source
// bytes -> 48 destination bytes).
__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
                                          uint8_t* dst_rgb,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax]  // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq xmm1, 4  // 8 bytes from 1
    pslldq xmm4, 12  // 4 bytes from 1 for 0
    movdqa xmm5, xmm2  // 8 bytes from 2 for 1
    por xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq xmm5, 8  // 8 bytes from 2 for 1
    movdqu [edx], xmm0  // store 0
    por xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq xmm2, 8  // 4 bytes from 2
    pslldq xmm3, 4  // 12 bytes from 3 for 2
    por xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Packs 32-bit ARGB pixels into 16-bit RGB565 (5 bits B, 6 bits G, 5 bits R;
// alpha is dropped). Each loop iteration handles 4 pixels (16 source bytes ->
// 8 destination bytes); a width that is not a multiple of 4 still runs a full
// final iteration.
__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    // R was sign-extended by psrad above, so the signed pack keeps the low
    // 16 bits of each dword instead of saturating.
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// Like ARGBToRGB565Row_SSE2, but first adds a dither value to every channel
// with unsigned saturation. dither4 supplies 4 bytes; byte i is replicated
// across the 4 channels of pixel i of each group of 4 pixels. Each loop
// iteration handles 4 pixels.
__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    movd xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    // NOTE(review): xmm7 is computed here but is not read anywhere else in
    // this function.
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3  // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4  // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5  // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    paddusb xmm0, xmm6  // add dither
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    pslld xmm0, 8  // R
    psrld xmm1, 3  // B
    psrld xmm2, 5  // G
    psrad xmm0, 16  // R
    pand xmm1, xmm3  // B
    pand xmm2, xmm4  // G
    pand xmm0, xmm5  // R
    por xmm1, xmm2  // BG
    por xmm0, xmm1  // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 version of the dithered ARGB -> RGB565 conversion. Each loop iteration
// handles 8 pixels (32 source bytes -> 16 destination bytes); a width that is
// not a multiple of 8 still runs a full final iteration.
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
                                                  uint8_t* dst_rgb,
                                                  const uint32_t dither4,
                                                  int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // width
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6  // add dither
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // gather the packed halves of both lanes
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
// Packs 32-bit ARGB pixels into 16-bit ARGB1555 (5 bits each of B, G, R and
// 1 bit of alpha taken from the top bit of the alpha byte). Each loop
// iteration handles 4 pixels (16 source bytes -> 8 destination bytes).
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4  // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4  // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7  // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0  // B
    movdqa xmm2, xmm0  // G
    movdqa xmm3, xmm0  // R
    psrad xmm0, 16  // A
    psrld xmm1, 3  // B
    psrld xmm2, 6  // G
    psrld xmm3, 9  // R
    pand xmm0, xmm7  // A
    pand xmm1, xmm4  // B
    pand xmm2, xmm5  // G
    pand xmm3, xmm6  // R
    por xmm0, xmm1  // BA
    por xmm2, xmm3  // GR
    por xmm0, xmm2  // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// Packs 32-bit ARGB pixels into 16-bit ARGB4444 by keeping the high nibble of
// every channel. Each loop iteration handles 4 pixels (16 source bytes ->
// 8 destination bytes).
__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm4, xmm4  // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4  // generate mask 0x00f000f0
    psrlw xmm3, 8

  convertloop:
    movdqu xmm0, [eax]  // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3  // low nibble
    pand xmm1, xmm4  // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
// AVX2 ARGB -> RGB565. Each loop iteration handles 8 pixels (32 source bytes
// -> 16 destination bytes); a width that is not a multiple of 8 still runs a
// full final iteration.
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
                                            uint8_t* dst_rgb,
                                            int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm2, ymm0, 5  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrld ymm0, ymm0, 8  // R
    vpand ymm2, ymm2, ymm4  // G
    vpand ymm1, ymm1, ymm3  // B
    vpand ymm0, ymm0, ymm5  // R
    vpor ymm1, ymm1, ymm2  // BG
    vpor ymm0, ymm0, ymm1  // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // gather the packed halves of both lanes
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
// AVX2 ARGB -> ARGB1555. Each loop iteration handles 8 pixels (32 source
// bytes -> 16 destination bytes).
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm4, ymm4, ymm4
    vpsrld ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld ymm7, ymm7, 15

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm3, ymm0, 9  // R
    vpsrld ymm2, ymm0, 6  // G
    vpsrld ymm1, ymm0, 3  // B
    vpsrad ymm0, ymm0, 16  // A
    vpand ymm3, ymm3, ymm6  // R
    vpand ymm2, ymm2, ymm5  // G
    vpand ymm1, ymm1, ymm4  // B
    vpand ymm0, ymm0, ymm7  // A
    vpor ymm0, ymm0, ymm1  // BA
    vpor ymm2, ymm2, ymm3  // GR
    vpor ymm0, ymm0, ymm2  // BGRA
    vpackssdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // gather the packed halves of both lanes
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB1555
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
// AVX2 ARGB -> ARGB4444. Each loop iteration handles 8 pixels (32 source
// bytes -> 16 destination bytes).
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
                                              uint8_t* dst_rgb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_rgb
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw ymm4, ymm4, 12
    vpsrlw ymm3, ymm4, 8  // generate mask 0x00f000f0

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpand ymm1, ymm0, ymm4  // high nibble
    vpand ymm0, ymm0, ymm3  // low nibble
    vpsrld ymm1, ymm1, 8
    vpsrld ymm0, ymm0, 4
    vpor ymm0, ymm0, ymm1
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // gather the packed halves of both lanes
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB4444
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Per pixel: multiply-accumulate the 4 bytes with the kARGBToY coefficients,
// shift right by 7, then add kAddY16. A width that is not a multiple of 16
// still runs a full final iteration.
__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_y,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToYJ
    movdqa xmm5, xmmword ptr kAddYJ64

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5  // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Dword permutation that restores pixel order after the lane-wise vphaddw
// and vpackuswb in the AVX2 Y conversions below.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// A width that is not a multiple of 32 still runs a full final iteration.
__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5  // add 16 for Y
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// YJ variant: uses the kARGBToYJ coefficients and adds kAddYJ64 for rounding
// before the shift; no offset is added afterwards.
__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
    vbroadcastf128 ymm5, xmmword ptr kAddYJ64
    vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

// Same instruction sequence as ARGBToYRow_SSSE3, using the kBGRAToY
// coefficients. 16 pixels (64 bytes) -> 16 Y bytes per loop iteration.
__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kBGRAToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Same instruction sequence as ARGBToYRow_SSSE3, using the kABGRToY
// coefficients. 16 pixels (64 bytes) -> 16 Y bytes per loop iteration.
__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kABGRToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Same instruction sequence as ARGBToYRow_SSSE3, using the kRGBAToY
// coefficients. 16 pixels (64 bytes) -> 16 Y bytes per loop iteration.
__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
                                        uint8_t* dst_y,
                                        int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kRGBAToY
    movdqa xmm5, xmmword ptr kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Computes 2x2-subsampled U and V from two ARGB rows: the row at src_argb and
// the row src_stride_argb bytes after it. Each loop iteration consumes 16
// pixels from each row and writes 8 U bytes and 8 V bytes. esi and edi are
// saved and restored around the body.
__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kBiasUV128
    movdqa xmm6, xmmword ptr kARGBToV
    movdqa xmm7, xmmword ptr kARGBToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// UVJ variant of ARGBToUVRow_SSSE3: uses the kARGBToUJ / kARGBToVJ
// coefficients and adds kBiasUV128 as 16-bit words before the arithmetic
// shift instead of adding it as bytes after packing. Each loop iteration
// consumes 16 pixels from each of two rows and writes 8 U and 8 V bytes.
__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
                                          int src_stride_argb,
                                          uint8_t* dst_u,
                                          uint8_t* dst_v,
                                          int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kBiasUV128
    movdqa xmm6, xmmword ptr kARGBToVJ
    movdqa xmm7, xmmword ptr kARGBToUJ
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5  // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
// AVX2 version of the 2x2-subsampled ARGB -> U/V conversion. Each loop
// iteration consumes 32 pixels from each of two rows (the second row is
// src_stride_argb bytes after the first) and writes 16 U and 16 V bytes.
__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb,
                                        int src_stride_argb,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToV
    vbroadcastf128 ymm7, xmmword ptr kARGBToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3  // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0  // mutates
    vpermq ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
    vpaddb ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2

#ifdef HAS_ARGBTOUVJROW_AVX2
// UVJ variant of ARGBToUVRow_AVX2: uses the kARGBToUJ / kARGBToVJ
// coefficients and adds kBiasUV128 as 16-bit words before the arithmetic
// shift. Each loop iteration consumes 32 pixels from each of two rows and
// writes 16 U and 16 V bytes.
__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, xmmword ptr kBiasUV128
    vbroadcastf128 ymm6, xmmword ptr kARGBToVJ
    vbroadcastf128 ymm7, xmmword ptr kARGBToUJ
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3  // mutates
    vphaddw ymm0, ymm0, ymm2
    vpaddw ymm1, ymm1, ymm5  // +.5 rounding -> unsigned
    vpaddw ymm0, ymm0, ymm5
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0  // mutates
    vpermq ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0  // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVJROW_AVX2

// Computes full-resolution U and V from a single ARGB row (no subsampling).
// Each loop iteration reads the same 16 pixels (64 bytes) twice, once with
// the U coefficients and once with the V coefficients, and writes 16 U bytes
// and 16 V bytes. edi is saved and restored around the body.
__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                                            uint8_t* dst_u,
                                            uint8_t* dst_v,
                                            int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_argb
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    movdqa xmm5, xmmword ptr kBiasUV128
    movdqa xmm6, xmmword ptr kARGBToV
    movdqa xmm7, xmmword ptr kARGBToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* convert to U and V */
    movdqu xmm0, [eax]  // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0

    movdqu xmm0, [eax]  // V
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    lea eax, [eax + 64]
    movdqu [edx + edi], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

// Same instruction sequence as ARGBToUVRow_SSSE3, using the kBGRAToU /
// kBGRAToV coefficients. Each loop iteration consumes 16 pixels from each of
// two rows and writes 8 U and 8 V bytes.
__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kBiasUV128
    movdqa xmm6, xmmword ptr kBGRAToV
    movdqa xmm7, xmmword ptr kBGRAToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// Same instruction sequence as ARGBToUVRow_SSSE3, using the kABGRToU /
// kABGRToV coefficients. Each loop iteration consumes 16 pixels from each of
// two rows and writes 8 U and 8 V bytes.
__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kBiasUV128
    movdqa xmm6, xmmword ptr kABGRToV
    movdqa xmm7, xmmword ptr kABGRToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// Same instruction sequence as ARGBToUVRow_SSSE3, using the kRGBAToU /
// kRGBAToV coefficients. Each loop iteration consumes 16 pixels from each of
// two rows and writes 8 U and 8 V bytes.
__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb,
                                         int src_stride_argb,
                                         uint8_t* dst_u,
                                         uint8_t* dst_v,
                                         int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    mov esi, [esp + 8 + 8]  // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    movdqa xmm5, xmmword ptr kBiasUV128
    movdqa xmm6, xmmword ptr kRGBAToV
    movdqa xmm7, xmmword ptr kRGBAToU
    sub edi, edx  // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0  // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

// Read 16 UV from 444
// Registers used by the macro body: esi = U pointer, edi = offset from U to
// V, eax = Y pointer (esi and eax are each advanced by 16). Results: ymm3 =
// interleaved UV bytes, ymm4 = Y bytes with each byte duplicated. ymm1 is
// used as scratch.
#define READYUV444_AVX2 \
  __asm { \
    __asm vmovdqu xmm3, [esi] /* U */ \
    __asm vmovdqu xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 16] \
    __asm vpermq ymm3, ymm3, 0xd8 \
    __asm vpermq ymm1, ymm1, 0xd8 \
    __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \
    __asm vmovdqu xmm4, [eax] /* Y */ \
    __asm vpermq ymm4, ymm4, 0xd8 \
    __asm vpunpcklbw ymm4, ymm4, ymm4 \
    __asm lea eax, [eax + 16]}

// Read 16 UV from 444. With 16 Alpha.
#define READYUVA444_AVX2 \ __asm { \ __asm vmovdqu xmm3, [esi] /* U */ \ __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ __asm lea ebp, [ebp + 16]} // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 \ __asm { \ __asm vmovq xmm3, qword ptr [esi] /* U */ \ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. #define READYUVA422_AVX2 \ __asm { \ __asm vmovq xmm3, qword ptr [esi] /* U */ \ __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ __asm vpunpcklbw ymm3, ymm3, ymm1 /* UV */ \ __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. #define READNV12_AVX2 \ __asm { \ __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpunpcklwd ymm3, ymm3, ymm3 /* UVUV (upsample) */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. 
#define READNV21_AVX2 \ __asm { \ __asm vmovdqu xmm3, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm3, ymm3, 0xd8 \ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleNV21 \ __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. #define READYUY2_AVX2 \ __asm { \ __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ __asm vmovdqu ymm3, [eax] /* UV */ \ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. #define READUYVY_AVX2 \ __asm { \ __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ __asm vmovdqu ymm3, [eax] /* UV */ \ __asm vpshufb ymm3, ymm3, ymmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. #define YUVTORGB_AVX2(YuvConstants) \ __asm { \ __asm vpsubb ymm3, ymm3, ymmword ptr kBiasUV128 \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ __asm vmovdqa ymm0, ymmword ptr [YuvConstants + KUVTOB] \ __asm vmovdqa ymm1, ymmword ptr [YuvConstants + KUVTOG] \ __asm vmovdqa ymm2, ymmword ptr [YuvConstants + KUVTOR] \ __asm vpmaddubsw ymm0, ymm0, ymm3 /* B UV */ \ __asm vpmaddubsw ymm1, ymm1, ymm3 /* G UV */ \ __asm vpmaddubsw ymm2, ymm2, ymm3 /* B UV */ \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KYBIASTORGB] \ __asm vpaddw ymm4, ymm3, ymm4 \ __asm vpaddsw ymm0, ymm0, ymm4 \ __asm vpsubsw ymm1, ymm4, ymm1 \ __asm vpaddsw ymm2, ymm2, ymm4 \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ __asm vpackuswb ymm0, ymm0, ymm0 \ __asm vpackuswb ymm1, ymm1, ymm1 \ __asm vpackuswb ymm2, ymm2, ymm2} // Store 16 ARGB values. 
#define STOREARGB_AVX2 \ __asm { \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ __asm lea edx, [edx + 64]} // Store 16 RGBA values. #define STORERGBA_AVX2 \ __asm { \ __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ __asm lea edx, [edx + 64]} #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) void I422ToARGBRow_AVX2( const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi push ebx mov eax, [esp + 12 + 4] // Y mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 YUVTORGB_AVX2(ebx) STOREARGB_AVX2 sub ecx, 16 jg convertloop pop ebx pop edi pop esi vzeroupper ret } } #endif // HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422ALPHATOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. 
// Arguments are read from the stack above the four saved registers.
// No constant alpha is generated here: READYUVA422_AVX2 reloads ymm5 from
// a_buf (ebp) on every pass.
__declspec(naked) void I422AlphaToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V

 convertloop:
    READYUVA422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// Same frame layout as the 422 version; the only difference is the read
// macro, which consumes one U and one V byte per pixel.
__declspec(naked) void I444ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2

#ifdef HAS_I444ALPHATOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y and 16 A producing 16 ARGB (64 bytes).
// Arguments are read from the stack above the four saved registers.
// Alpha comes from a_buf (ebp) through READYUVA444_AVX2, so ymm5 is not
// preset before the loop.
__declspec(naked) void I444AlphaToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V

 convertloop:
    READYUVA444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444ALPHATOARGBROW_AVX2

#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Only two source pointers are needed (Y in eax, interleaved UV in esi), so
// edi is neither saved nor used.
__declspec(naked) void NV12ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* uv_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]  // Y
    mov        esi, [esp + 8 + 8]  // UV
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2

#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Identical to the NV12 version except for the read macro, which shuffles
// the loaded VU bytes through kShuffleNV21.
__declspec(naked) void NV21ToARGBRow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* vu_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]  // Y
    mov        esi, [esp + 8 + 8]  // VU
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV21_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2

#ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Packed input: a single source pointer (eax) feeds both Y and UV, so only
// ebx needs to be saved.
__declspec(naked) void YUY2ToARGBRow_AVX2(
    const uint8_t* src_yuy2,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]  // yuy2
    mov        edx, [esp + 4 + 8]  // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOARGBROW_AVX2

#ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Same structure as YUY2ToARGBRow_AVX2 with the UYVY read macro.
__declspec(naked) void UYVYToARGBRow_AVX2(
    const uint8_t* src_uyvy,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]  // uyvy
    mov        edx, [esp + 4 + 8]  // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READUYVY_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_UYVYTOARGBROW_AVX2

#ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Same as I422ToARGBRow_AVX2 except that STORERGBA_AVX2 performs the store.
// The destination parameter is named dst_argb but receives the RGBA output.
__declspec(naked) void I422ToRGBARow_AVX2(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // abgr
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V
    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STORERGBA_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TORGBAROW_AVX2

#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Allows a conversion with half size scaling.

// Register usage of the SSSE3 READ* macros matches the AVX2 ones:
//   eax = Y pointer, esi = U (or interleaved UV) pointer,
//   [esi + edi] = V, ebp = alpha pointer.
// Each macro leaves UV in xmm3 and Y (each byte duplicated) in xmm4.

// Read 8 UV from 444.
#define READYUV444 \
  __asm { \
    __asm movq xmm3, qword ptr [esi] /* U */ \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklbw xmm3, xmm1 /* UV */ \
    __asm movq xmm4, qword ptr [eax] /* Y */ \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 8 UV from 444. With 8 Alpha.
#define READYUVA444 \
  __asm { \
    __asm movq xmm3, qword ptr [esi] /* U */ \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklbw xmm3, xmm1 /* UV */ \
    __asm movq xmm4, qword ptr [eax] /* Y */ \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8] \
    __asm movq xmm5, qword ptr [ebp] /* A */ \
    __asm lea ebp, [ebp + 8]}

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  __asm { \
    __asm movd xmm3, [esi] /* U */ \
    __asm movd xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm punpcklbw xmm3, xmm1 /* UV */ \
    __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] /* Y */ \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \ __asm { \ __asm movd xmm3, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ __asm punpcklbw xmm3, xmm1 /* UV */ \ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ __asm movq xmm5, qword ptr [ebp] /* A */ \ __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. #define READNV12 \ __asm { \ __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm punpcklwd xmm3, xmm3 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. #define READNV21 \ __asm { \ __asm movq xmm3, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm pshufb xmm3, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. #define READYUY2 \ __asm { \ __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ __asm movdqu xmm3, [eax] /* UV */ \ __asm pshufb xmm3, xmmword ptr kShuffleYUY2UV \ __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. #define READUYVY \ __asm { \ __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ __asm movdqu xmm3, [eax] /* UV */ \ __asm pshufb xmm3, xmmword ptr kShuffleUYVYUV \ __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. 
#define YUVTORGB(YuvConstants) \ __asm { \ __asm psubb xmm3, xmmword ptr kBiasUV128 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVTOB] \ __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVTOG] \ __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVTOR] \ __asm pmaddubsw xmm0, xmm3 \ __asm pmaddubsw xmm1, xmm3 \ __asm pmaddubsw xmm2, xmm3 \ __asm movdqa xmm3, xmmword ptr [YuvConstants + KYBIASTORGB] \ __asm paddw xmm4, xmm3 \ __asm paddsw xmm0, xmm4 \ __asm paddsw xmm2, xmm4 \ __asm psubsw xmm4, xmm1 \ __asm movdqa xmm1, xmm4 \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ __asm packuswb xmm0, xmm0 /* B */ \ __asm packuswb xmm1, xmm1 /* G */ \ __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. #define STOREARGB \ __asm { \ __asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32]} // Store 8 BGRA values. #define STOREBGRA \ __asm { \ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ __asm punpcklbw xmm1, xmm0 /* GB */ \ __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGBA values. #define STORERGBA \ __asm { \ __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ __asm punpcklbw xmm1, xmm2 /* GR */ \ __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32]} // Store 8 RGB24 values. 
#define STORERGB24 \ __asm {/* Weave into RRGB */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24]} // Store 8 RGB565 values. #define STORERGB565 \ __asm {/* Weave into RRGB */ \ __asm punpcklbw xmm0, xmm1 /* BG */ \ __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ __asm movdqa xmm2, xmm0 /* G */ \ __asm pslld xmm0, 8 /* R */ \ __asm psrld xmm3, 3 /* B */ \ __asm psrld xmm2, 5 /* G */ \ __asm psrad xmm0, 16 /* R */ \ __asm pand xmm3, xmm5 /* B */ \ __asm pand xmm2, xmm6 /* G */ \ __asm pand xmm0, xmm7 /* R */ \ __asm por xmm3, xmm2 /* BG */ \ __asm por xmm0, xmm3 /* BGR */ \ __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ __asm movdqa xmm2, xmm1 /* G */ \ __asm pslld xmm1, 8 /* R */ \ __asm psrld xmm3, 3 /* B */ \ __asm psrld xmm2, 5 /* G */ \ __asm psrad xmm1, 16 /* R */ \ __asm pand xmm3, xmm5 /* B */ \ __asm pand xmm2, xmm6 /* G */ \ __asm pand xmm1, xmm7 /* R */ \ __asm por xmm3, xmm2 /* BG */ \ __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm lea edx, [edx + 16]} // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
// Arguments are read from the stack above the three saved registers.
// The loop runs while (width -= 8) > 0, so it always handles whole groups of
// 8 pixels.
__declspec(naked) void I444ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V
    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV444
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 8 UV values, mixed with 8 Y and 8A producing 8 ARGB (32 bytes).
// Alpha comes from a_buf (ebp) through READYUVA444, so xmm5 is not preset
// before the loop.
__declspec(naked) void I444AlphaToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V

 convertloop:
    READYUVA444
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
// xmm5 and xmm6 are loaded once with the two shuffle masks that STORERGB24
// expects; they stay live across the loop.
__declspec(naked) void I422ToRGB24Row_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_rgb24,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgb24
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V
    movdqa     xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0
    movdqa     xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB24

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
// xmm5, xmm6 and xmm7 are built once as the blue, green and red field masks
// that STORERGB565 expects.
__declspec(naked) void I422ToRGB565Row_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* rgb565_buf,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // rgb565
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V
    pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
    psrld      xmm5, 27
    pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
    psrld      xmm6, 26
    pslld      xmm6, 5
    pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
    pslld      xmm7, 11

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB565

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Arguments are read from the stack above the three saved registers.
// xmm5 is set to all ones once and used as the opaque alpha by STOREARGB.
__declspec(naked) void I422ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]  // Y
    mov        esi, [esp + 12 + 8]  // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V
    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
// Alpha comes from a_buf (ebp) through READYUVA422, so xmm5 is not preset
// before the loop.
__declspec(naked) void I422AlphaToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* u_buf,
    const uint8_t* v_buf,
    const uint8_t* a_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]  // Y
    mov        esi, [esp + 16 + 8]  // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi  // edi = V - U, so [esi + edi] addresses V

 convertloop:
    READYUVA422
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Only two source pointers are needed (Y in eax, interleaved UV in esi).
__declspec(naked) void NV12ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* uv_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]  // Y
    mov        esi, [esp + 8 + 8]  // UV
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READNV12
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 VU values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Identical to the NV12 version except for the read macro, which shuffles
// the loaded VU bytes through kShuffleNV21.
__declspec(naked) void NV21ToARGBRow_SSSE3(
    const uint8_t* y_buf,
    const uint8_t* vu_buf,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       ebx
    mov        eax, [esp + 8 + 4]  // Y
    mov        esi, [esp + 8 + 8]  // VU
    mov        edx, [esp + 8 + 12]  // argb
    mov        ebx, [esp + 8 + 16]  // yuvconstants
    mov        ecx, [esp + 8 + 20]  // width
    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READNV21
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    pop        esi
    ret
  }
}

// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
// Packed input: a single source pointer (eax) feeds both Y and UV, so only
// ebx needs to be saved.
__declspec(naked) void YUY2ToARGBRow_SSSE3(
    const uint8_t* src_yuy2,
    uint8_t* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       ebx
    mov        eax, [esp + 4 + 4]  // yuy2
    mov        edx, [esp + 4 + 8]  // argb
    mov        ebx, [esp + 4 + 12]  // yuvconstants
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUY2
    YUVTORGB(ebx)
    STOREARGB

    sub        ecx, 8
    jg         convertloop

    pop        ebx
    ret
  }
}

// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void UYVYToARGBRow_SSSE3( const uint8_t* src_uyvy, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __asm { push ebx mov eax, [esp + 4 + 4] // uyvy mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READUYVY YUVTORGB(ebx) STOREARGB sub ecx, 8 jg convertloop pop ebx ret } } __declspec(naked) void I422ToRGBARow_SSSE3( const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { __asm { push esi push edi push ebx mov eax, [esp + 12 + 4] // Y mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi convertloop: READYUV422 YUVTORGB(ebx) STORERGBA sub ecx, 8 jg convertloop pop ebx pop edi pop esi ret } } #endif // HAS_I422TOARGBROW_SSSE3 // I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* rgb_buf, const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax pshufd xmm2, xmm2,0 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) movd xmm3, eax pshufd xmm3, xmm3, 0 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width convertloop: // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] punpcklbw xmm0, xmm0 // Y.Y pmulhuw xmm0, xmm2 psubusw xmm0, xmm3 psrlw xmm0, 6 packuswb xmm0, xmm0 // G // Step 2: Weave into ARGB punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 punpcklwd xmm0, xmm0 // BGRA first 4 pixels punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop ret } } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* rgb_buf, const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax vbroadcastss ymm2, xmm2 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) vmovd xmm3, eax vbroadcastss ymm3, xmm3 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 vpslld ymm4, ymm4, 24 mov eax, [esp + 4] // Y mov edx, [esp + 8] // rgb mov ecx, [esp + 12] // width convertloop: // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 vmovdqu xmm0, [eax] lea eax, [eax + 16] vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates vpunpcklbw ymm0, ymm0, ymm0 // Y.Y vpmulhuw ymm0, ymm0, ymm2 vpsubusw ymm0, ymm0, ymm3 vpsrlw ymm0, ymm0, 6 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 // TODO(fbarchard): Weave alpha with unpack. // Step 2: Weave into ARGB vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels vpor ymm0, ymm0, ymm4 vpor ymm1, ymm1, ymm4 vmovdqu [edx], ymm0 vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_I400TOARGBROW_AVX2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. 
// Used as a pshufb control: output byte i takes input byte 15 - i.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Writes the bytes of src to dst in reverse order, 16 bytes per pass.
// ecx doubles as the remaining count and as the offset of the next source
// block, so the source is read from its end towards its start while dst
// advances forwards.
// TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
                                       uint8_t* dst,
                                       int width) {
  __asm {
    mov       eax, [esp + 4]  // src
    mov       edx, [esp + 8]  // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm5, xmmword ptr kShuffleMirror

 convertloop:
    movdqu    xmm0, [eax - 16 + ecx]  // last unread 16 bytes of src
    pshufb    xmm0, xmm5
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
// Writes the bytes of src to dst in reverse order, 32 bytes per pass.
// NOTE(review): kShuffleMirror is defined inside the HAS_MIRRORROW_SSSE3
// block above, so this function relies on that macro being defined as well.
__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
                                      uint8_t* dst,
                                      int width) {
  __asm {
    mov       eax, [esp + 4]  // src
    mov       edx, [esp + 8]  // dst
    mov       ecx, [esp + 12]  // width
    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror

 convertloop:
    vmovdqu   ymm0, [eax - 32 + ecx]  // last unread 32 bytes of src
    vpshufb   ymm0, ymm0, ymm5  // reverse the bytes within each 128 bit lane
    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
    vmovdqu   [edx], ymm0
    lea       edx, [edx + 32]
    sub       ecx, 32
    jg        convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
// Used as a pshufb control: the low 8 output bytes take the even input bytes
// in reverse order, the high 8 output bytes take the odd input bytes in
// reverse order.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};

// Reads interleaved UV pairs from the end of src towards its start and
// writes the reversed U bytes to dst_u and the reversed V bytes to dst_v,
// 8 pairs (16 source bytes) per pass. width counts UV pairs.
__declspec(naked) void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                                              uint8_t* dst_u,
                                              uint8_t* dst_v,
                                              int width) {
  __asm {
    push      edi
    mov       eax, [esp + 4 + 4]  // src
    mov       edx, [esp + 4 + 8]  // dst_u
    mov       edi, [esp + 4 + 12]  // dst_v
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm1, xmmword ptr kShuffleMirrorUV
    lea       eax, [eax + ecx * 2 - 16]  // last 8 UV pairs of src
    sub       edi, edx  // edi = dst_v - dst_u, so [edx + edi] addresses dst_v

 convertloop:
    movdqu    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm1
    movlpd    qword ptr [edx], xmm0  // low 8 bytes to dst_u
    movhpd    qword ptr [edx + edi], xmm0  // high 8 bytes to dst_v
    lea       edx, [edx + 8]
    sub       ecx, 8
    jg        convertloop

    pop       edi
    ret
  }
}
#endif  // HAS_MIRRORSPLITUVROW_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSE2
// Writes the 32 bit pixels of src to dst in reverse order, 4 pixels per
// pass. width counts pixels.
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
                                          uint8_t* dst,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src
    mov       edx, [esp + 8]  // dst
    mov       ecx, [esp + 12]  // width
    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.

 convertloop:
    movdqu    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufd    xmm0, xmm0, 0x1b  // reverse the four dwords
    movdqu    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSE2

#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the order of the 32 bit pixels.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u,
                                                3u, 2u, 1u, 0u};

// Mirror a row of 32 bit pixels, 8 pixels per iteration. vpermd loads the
// last 8 unprocessed source pixels directly from memory and writes them in
// reversed dword order. width is in pixels.
__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
                                          uint8_t* dst,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2

  convertloop:
    vpermd ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
// Split an interleaved UV row into separate U and V rows. Each iteration
// reads 32 source bytes (16 UV pairs) and writes 16 even bytes to dst_u and
// 16 odd bytes to dst_v. width is in UV pairs; 16 are consumed per iteration.
__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
                                       uint8_t* dst_u,
                                       uint8_t* dst_v,
                                       int width) {
  __asm {
    push edi  // callee-saved; used for the dst_v offset
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx  // edi = dst_v - dst_u, so only edx needs advancing

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0  // keep copies for the odd bytes
    movdqa xmm3, xmm1
    pand xmm0, xmm5  // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8  // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [edx], xmm0
    movdqu [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_SPLITUVROW_AVX2
// AVX2 variant of SplitUVRow: reads 64 source bytes (32 UV pairs) per
// iteration and writes 32 bytes to each of dst_u and dst_v. vpackuswb packs
// within each 128 bit lane, so vpermq 0xd8 (qword order 0,2,1,3) restores
// linear order before the stores.
__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
                                       uint8_t* dst_u,
                                       uint8_t* dst_v,
                                       int width) {
  __asm {
    push edi  // callee-saved; used for the dst_v offset
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx  // edi = dst_v - dst_u, so only edx needs advancing

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm2, ymm0, 8  // odd bytes
    vpsrlw ymm3, ymm1, 8
    vpand ymm0, ymm0, ymm5  // even bytes
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpackuswb ymm2, ymm2, ymm3
    vpermq ymm0, ymm0, 0xd8  // undo the per-lane ordering of the pack
    vpermq ymm2, ymm2, 0xd8
    vmovdqu [edx], ymm0
    vmovdqu [edx + edi], ymm2
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_SPLITUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
// Interleave separate U and V rows into one UV row. Each iteration reads 16
// bytes from src_u and 16 from src_v and writes 32 interleaved bytes to
// dst_uv. width is in UV pairs; 16 are consumed per iteration.
__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_uv,
                                       int width) {
  __asm {
    push edi  // callee-saved; used as the destination pointer
    mov eax, [esp + 4 + 4]  // src_u
    mov edx, [esp + 4 + 8]  // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax  // edx = src_v - src_u, so only eax needs advancing

  convertloop:
    movdqu xmm0, [eax]  // read 16 U's
    movdqu xmm1, [eax + edx]  // and 16 V's
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    punpcklbw xmm0, xmm1  // first 8 UV pairs
    punpckhbw xmm2, xmm1  // next 8 UV pairs
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
// AVX2 variant of MergeUVRow: 32 U and 32 V bytes in, 64 interleaved bytes
// out per iteration. The unpack instructions interleave within each 128 bit
// lane, so the four 16 byte pieces are stored individually in linear order
// rather than with two 32 byte stores.
__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
                                       const uint8_t* src_v,
                                       uint8_t* dst_uv,
                                       int width) {
  __asm {
    push edi  // callee-saved; used as the destination pointer
    mov eax, [esp + 4 + 4]  // src_u
    mov edx, [esp + 4 + 8]  // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax  // edx = src_v - src_u, so only eax needs advancing

  convertloop:
    vmovdqu ymm0, [eax]  // read 32 U's
    vmovdqu ymm1, [eax + edx]  // and 32 V's
    lea eax, [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
    vextractf128 [edi], ymm2, 0  // bytes 0..15
    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
    lea edi, [edi + 64]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at a time.
// Uses aligned moves (movdqa) only when both src and dst are 16 byte aligned;
// otherwise falls back to the unaligned (movdqu) loop. Either loop runs at
// least once and consumes width in steps of 32.
__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
                                    uint8_t* dst,
                                    int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    test eax, 15  // src not 16 byte aligned?
    jne convertloopu
    test edx, 15  // dst not 16 byte aligned?
    jne convertloopu

  convertloopa:  // aligned copy
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopa
    ret

  convertloopu:  // unaligned copy
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopu
    ret
  }
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
// CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at a time.
// Always uses unaligned moves; the loop runs at least once.
__declspec(naked) void CopyRow_AVX(const uint8_t* src,
                                   uint8_t* dst,
                                   int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 64
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_COPYROW_AVX

// Multiple of 1.
// Copies 'width' bytes with a single rep movsb. esi and edi are callee-saved
// but required by rep movsb, so they are parked in eax/edx and restored.
__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
                                    uint8_t* dst,
                                    int width) {
  __asm {
    mov eax, esi  // save esi
    mov edx, edi  // save edi
    mov esi, [esp + 4]  // src
    mov edi, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    rep movsb
    mov edi, edx  // restore edi
    mov esi, eax  // restore esi
    ret
  }
}

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copies the top byte (alpha) of each 32 bit src pixel into the matching dst
// pixel, keeping the low 3 bytes of dst. 8 pixels per iteration.
__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
                                             uint8_t* dst,
                                             int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
    psrld xmm1, 8

  convertloop:
    movdqu xmm2, [eax]
    movdqu xmm3, [eax + 16]
    lea eax, [eax + 32]
    movdqu xmm4, [edx]
    movdqu xmm5, [edx + 16]
    pand xmm2, xmm0  // src alpha only
    pand xmm3, xmm0
    pand xmm4, xmm1  // dst without alpha
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 variant of ARGBCopyAlphaRow, 16 pixels per iteration. vpblendvb takes
// the bytes selected by the mask (low 3 bytes of each pixel) from dst and the
// remaining byte (alpha) from src.
__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
                                             uint8_t* dst,
                                             int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff

  convertloop:
    vmovdqu ymm1, [eax]
    vmovdqu ymm2, [eax + 32]
    lea eax, [eax + 64]
    vpblendvb ymm1, ymm1, [edx], ymm0
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
// Extracts the top byte (alpha) of each 32 bit pixel into a byte row.
// 8 pixels in, 8 bytes out per iteration.
__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
                                                uint8_t* dst_a,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_a
    mov ecx, [esp + 12]  // width

  extractloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm0, 24  // alpha in low 8 bits of each dword
    psrld xmm1, 24
    packssdw xmm0, xmm1  // dwords to words
    packuswb xmm0, xmm0  // words to bytes
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg extractloop

    ret
  }
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// width in pixels
// AVX2 variant of ARGBExtractAlphaRow: 32 pixels (128 bytes) in, 32 alpha
// bytes out per iteration. The packs operate per 128 bit lane, so a final
// vpermd with kPermdARGBToY_AVX puts the bytes back in linear order.
__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
                                                uint8_t* dst_a,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_a
    mov ecx, [esp + 12]  // width
    vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX

  extractloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpsrld ymm0, ymm0, 24
    vpsrld ymm1, ymm1, 24
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    lea eax, [eax + 128]
    vpackssdw ymm0, ymm0, ymm1  // mutates
    vpsrld ymm2, ymm2, 24
    vpsrld ymm3, ymm3, 24
    vpackssdw ymm2, ymm2, ymm3  // mutates
    vpackuswb ymm0, ymm0, ymm2  // mutates
    vpermd ymm0, ymm4, ymm0  // unmutate
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg extractloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Writes each source byte into the top byte (alpha) of the matching 32 bit
// dst pixel, keeping the low 3 bytes of dst. 8 pixels per iteration.
__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
                                                uint8_t* dst,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
    psrld xmm1, 8

  convertloop:
    movq xmm2, qword ptr [eax]  // 8 Y's
    lea eax, [eax + 8]
    punpcklbw xmm2, xmm2  // each Y duplicated into a word
    // The previous contents of xmm3 land in the low word of each dword and
    // are cleared by the 0xff000000 mask below; only xmm2's words survive.
    punpckhwd xmm3, xmm2
    punpcklwd xmm2, xmm2
    movdqu xmm4, [edx]
    movdqu xmm5, [edx + 16]
    pand xmm2, xmm0  // Y in the alpha byte
    pand xmm3, xmm0
    pand xmm4, xmm1  // dst without alpha
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 variant of ARGBCopyYToAlphaRow, 16 pixels per iteration. Each source
// byte is zero extended to a dword, shifted into the top byte and blended
// with the low 3 bytes of dst.
__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
                                                uint8_t* dst,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff

  convertloop:
    vpmovzxbd ymm1, qword ptr [eax]  // 8 bytes -> 8 dwords
    vpmovzxbd ymm2, qword ptr [eax + 8]
    lea eax, [eax + 16]
    vpslld ymm1, ymm1, 24  // move the byte into the alpha position
    vpslld ymm2, ymm2, 24
    vpblendvb ymm1, ymm1, [edx], ymm0
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// Write 'width' bytes using an 8 bit value repeated.
// width should be multiple of 4.
// The byte is replicated into all 4 bytes of eax and width / 4 dwords are
// stored with rep stosd. edi is callee-saved, so it is parked in edx.
__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
  __asm {
    movzx eax, byte ptr [esp + 8]  // v8
    mov edx, 0x01010101  // Duplicate byte to all bytes.
    mul edx  // overwrites edx with upper part of result.
    mov edx, edi  // save edi
    mov edi, [esp + 4]  // dst
    mov ecx, [esp + 12]  // width
    shr ecx, 2  // byte count -> dword count
    rep stosd
    mov edi, edx  // restore edi
    ret
  }
}

// Write 'width' bytes using an 8 bit value repeated.
// Uses rep stosb, which stores only the low byte (al) of the loaded value.
__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
  __asm {
    mov edx, edi  // save edi
    mov edi, [esp + 4]  // dst
    mov eax, [esp + 8]  // v8
    mov ecx, [esp + 12]  // width
    rep stosb
    mov edi, edx  // restore edi
    ret
  }
}

// Write 'width' 32 bit values.
__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
                                      uint32_t v32,
                                      int width) {
  __asm {
    mov edx, edi  // save edi
    mov edi, [esp + 4]  // dst
    mov eax, [esp + 8]  // v32
    mov ecx, [esp + 12]  // width
    rep stosd
    mov edi, edx  // restore edi
    ret
  }
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_AVX2
// Extracts the Y bytes (even bytes) of a YUY2 row. 64 source bytes in, 32 Y
// bytes out per iteration.
__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov eax, [esp + 4]  // src_yuy2
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5  // even bytes are Y
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}

// Extracts U and V from two YUY2 rows (src_yuy2 and src_yuy2 + stride_yuy2),
// averaging the rows with vpavgb. 64 source bytes per row in, 16 U and 16 V
// bytes out per iteration; width is decremented by 32 per iteration.
__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                                        int stride_yuy2,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_yuy2
    mov esi, [esp + 8 + 8]  // stride_yuy2
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpavgb ymm0, ymm0, [eax + esi]  // average with the second row
    vpavgb ymm1, ymm1, [eax + esi + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // YUYV -> UVUV
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5  // U
    vpsrlw ymm0, ymm0, 8  // V
    vpackuswb ymm1, ymm1, ymm1  // mutates.
    vpackuswb ymm0, ymm0, ymm0  // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}

// Extracts U and V from a single YUY2 row (no vertical averaging).
// 64 source bytes in, 16 U and 16 V bytes out per iteration.
__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_yuy2
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // YUYV -> UVUV
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5  // U
    vpsrlw ymm0, ymm0, 8  // V
    vpackuswb ymm1, ymm1, ymm1  // mutates.
    vpackuswb ymm0, ymm0, ymm0  // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}

// Extracts the Y bytes (odd bytes) of a UYVY row. 64 source bytes in, 32 Y
// bytes out per iteration.
__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov eax, [esp + 4]  // src_uyvy
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // width

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // odd bytes are Y
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}

// Extracts U and V from two UYVY rows (src_uyvy and src_uyvy + stride_uyvy),
// averaging the rows with vpavgb. 64 source bytes per row in, 16 U and 16 V
// bytes out per iteration.
__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
                                        int stride_uyvy,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_uyvy
    mov esi, [esp + 8 + 8]  // stride_uyvy
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpavgb ymm0, ymm0, [eax + esi]  // average with the second row
    vpavgb ymm1, ymm1, [eax + esi + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5  // UYVY -> UVUV
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5  // U
    vpsrlw ymm0, ymm0, 8  // V
    vpackuswb ymm1, ymm1, ymm1  // mutates.
    vpackuswb ymm0, ymm0, ymm0  // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}

// Extracts U and V from a single UYVY row (no vertical averaging).
// 64 source bytes in, 16 U and 16 V bytes out per iteration.
__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uyvy
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5  // UYVY -> UVUV
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1  // mutates.
    vpermq ymm0, ymm0, 0xd8
    vpand ymm1, ymm0, ymm5  // U
    vpsrlw ymm0, ymm0, 8  // V
    vpackuswb ymm1, ymm1, ymm1  // mutates.
    vpackuswb ymm0, ymm0, ymm0  // mutates.
    vpermq ymm1, ymm1, 0xd8
    vpermq ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0  // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y bytes (even bytes) of a YUY2 row. 32 source bytes in, 16 Y
// bytes out per iteration.
__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov eax, [esp + 4]  // src_yuy2
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5  // even bytes are Y
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Extracts U and V from two YUY2 rows (src_yuy2 and src_yuy2 + stride_yuy2),
// averaging the rows with pavgb. 32 source bytes per row in, 8 U and 8 V
// bytes out per iteration.
__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                                        int stride_yuy2,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_yuy2
    mov esi, [esp + 8 + 8]  // stride_yuy2
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]  // second row
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    psrlw xmm0, 8  // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// Extracts U and V from a single YUY2 row (no vertical averaging).
// 32 source bytes in, 8 U and 8 V bytes out per iteration.
__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_yuy2
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // YUYV -> UVUV
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

// Extracts the Y bytes (odd bytes) of a UYVY row. 32 source bytes in, 16 Y
// bytes out per iteration.
__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
                                       uint8_t* dst_y,
                                       int width) {
  __asm {
    mov eax, [esp + 4]  // src_uyvy
    mov edx, [esp + 8]  // dst_y
    mov ecx, [esp + 12]  // width

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // odd bytes are Y
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// Extracts U and V from two UYVY rows (src_uyvy and src_uyvy + stride_uyvy),
// averaging the rows with pavgb. 32 source bytes per row in, 8 U and 8 V
// bytes out per iteration.
__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
                                        int stride_uyvy,
                                        uint8_t* dst_u,
                                        uint8_t* dst_v,
                                        int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_uyvy
    mov esi, [esp + 8 + 8]  // stride_uyvy
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]  // second row
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    pand xmm0, xmm5  // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

// Extracts U and V from a single UYVY row (no vertical averaging).
// 32 source bytes in, 8 U and 8 V bytes out per iteration.
__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
                                           uint8_t* dst_u,
                                           uint8_t* dst_v,
                                           int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uyvy
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx  // edi = dst_v - dst_u

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5  // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5  // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8  // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_YUY2TOYROW_SSE2

#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// src0, src1 and dst are converted to offsets from the alpha pointer so that
// only esi is advanced inside the loop.
__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
                                           const uint8_t* src1,
                                           const uint8_t* alpha,
                                           uint8_t* dst,
                                           int width) {
  __asm {
    push esi
    push edi
    pcmpeqb xmm5, xmm5  // generate mask 0xff00ff00
    psllw xmm5, 8
    mov eax, 0x80808080  // 128 for biasing image to signed.
    movd xmm6, eax
    pshufd xmm6, xmm6, 0x00
    mov eax, 0x807f807f  // 32768 + 127 for unbias and round.
    movd xmm7, eax
    pshufd xmm7, xmm7, 0x00
    mov eax, [esp + 8 + 4]  // src0
    mov edx, [esp + 8 + 8]  // src1
    mov esi, [esp + 8 + 12]  // alpha
    mov edi, [esp + 8 + 16]  // dst
    mov ecx, [esp + 8 + 20]  // width
    sub eax, esi  // src0 - alpha
    sub edx, esi  // src1 - alpha
    sub edi, esi  // dst - alpha

    // 8 pixel loop.
  convertloop8:
    movq xmm0, qword ptr [esi]  // alpha
    punpcklbw xmm0, xmm0
    pxor xmm0, xmm5  // a, 255-a
    movq xmm1, qword ptr [eax + esi]  // src0
    movq xmm2, qword ptr [edx + esi]  // src1
    punpcklbw xmm1, xmm2
    psubb xmm1, xmm6  // bias src0/1 - 128
    pmaddubsw xmm0, xmm1
    paddw xmm0, xmm7  // unbias result - 32768 and round.
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edi + esi], xmm0
    lea esi, [esi + 8]
    sub ecx, 8
    jg convertloop8

    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_BLENDPLANEROW_SSSE3

#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// src0, src1 and dst are converted to offsets from the alpha pointer so that
// only esi is advanced inside the loop.
__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
                                          const uint8_t* src1,
                                          const uint8_t* alpha,
                                          uint8_t* dst,
                                          int width) {
  __asm {
    push esi
    push edi
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xff00ff00
    vpsllw ymm5, ymm5, 8
    mov eax, 0x80808080  // 128 for biasing image to signed.
    vmovd xmm6, eax
    vbroadcastss ymm6, xmm6
    mov eax, 0x807f807f  // 32768 + 127 for unbias and round.
    vmovd xmm7, eax
    vbroadcastss ymm7, xmm7
    mov eax, [esp + 8 + 4]  // src0
    mov edx, [esp + 8 + 8]  // src1
    mov esi, [esp + 8 + 12]  // alpha
    mov edi, [esp + 8 + 16]  // dst
    mov ecx, [esp + 8 + 20]  // width
    sub eax, esi  // src0 - alpha
    sub edx, esi  // src1 - alpha
    sub edi, esi  // dst - alpha

    // 32 pixel loop.
  convertloop32:
    vmovdqu ymm0, [esi]  // alpha
    vpunpckhbw ymm3, ymm0, ymm0  // 8..15, 24..31
    vpunpcklbw ymm0, ymm0, ymm0  // 0..7, 16..23
    vpxor ymm3, ymm3, ymm5  // a, 255-a
    vpxor ymm0, ymm0, ymm5  // a, 255-a
    vmovdqu ymm1, [eax + esi]  // src0
    vmovdqu ymm2, [edx + esi]  // src1
    vpunpckhbw ymm4, ymm1, ymm2
    vpunpcklbw ymm1, ymm1, ymm2
    vpsubb ymm4, ymm4, ymm6  // bias src0/1 - 128
    vpsubb ymm1, ymm1, ymm6  // bias src0/1 - 128
    vpmaddubsw ymm3, ymm3, ymm4
    vpmaddubsw ymm0, ymm0, ymm1
    vpaddw ymm3, ymm3, ymm7  // unbias result - 32768 and round.
    vpaddw ymm0, ymm0, ymm7  // unbias result - 32768 and round.
    vpsrlw ymm3, ymm3, 8
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm3
    vmovdqu [edi + esi], ymm0
    lea esi, [esi + 32]
    sub ecx, 32
    jg convertloop32

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_BLENDPLANEROW_AVX2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time, then 1 pixel at a time for the remainder.
// src_argb is blended over src_argb1 and the result alpha is set to 255.
// Unlike most row functions here, any width (including 0) is handled.
__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb,
                                          const uint8_t* src_argb1,
                                          uint8_t* dst_argb,
                                          int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_argb1
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm7, xmm7  // generate constant 0x0001
    psrlw xmm7, 15
    pcmpeqb xmm6, xmm6  // generate mask 0x00ff00ff
    psrlw xmm6, 8
    pcmpeqb xmm5, xmm5  // generate mask 0xff00ff00
    psllw xmm5, 8
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24
    sub ecx, 4
    jl convertloop4b  // less than 4 pixels?

    // 4 pixel loop.
  convertloop4:
    movdqu xmm3, [eax]  // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movdqu xmm2, [esi]  // _r_b
    pshufb xmm3, xmmword ptr kShuffleAlpha  // alpha
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movdqu xmm1, [esi]  // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jge convertloop4

  convertloop4b:
    add ecx, 4 - 1  // remaining pixels minus 1
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax]  // src argb
    lea eax, [eax + 4]
    movdqa xmm0, xmm3  // src argb
    pxor xmm3, xmm4  // ~alpha
    movd xmm2, [esi]  // _r_b
    pshufb xmm3, xmmword ptr kShuffleAlpha  // alpha
    pand xmm2, xmm6  // _r_b
    paddw xmm3, xmm7  // 256 - alpha
    pmullw xmm2, xmm3  // _r_b * alpha
    movd xmm1, [esi]  // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8  // _a_g
    por xmm0, xmm4  // set alpha to 255
    pmullw xmm1, xmm3  // _a_g * alpha
    psrlw xmm2, 8  // _r_b convert to 8 bits again
    paddusb xmm0, xmm2  // + src argb
    pand xmm1, xmm5  // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1  // + src argb
    movd [edx], xmm0
    lea edx, [edx + 4]
    sub ecx, 1
    jge convertloop1

  convertloop1b:
    pop esi
    ret
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
// kShuffleAlpha0 handles pixels 0 and 1, kShuffleAlpha1 pixels 2 and 3. The
// 128u entries zero the word that lines up with the alpha channel.
static const uvec8 kShuffleAlpha0 = {
    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Multiplies the B, G and R bytes of each pixel by that pixel's alpha and
// copies the original alpha through unchanged. 4 pixels per iteration.
__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
                                              uint8_t* dst_argb,
                                              int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    pcmpeqb xmm3, xmm3  // generate mask 0xff000000
    pslld xmm3, 24
    movdqa xmm4, xmmword ptr kShuffleAlpha0
    movdqa xmm5, xmmword ptr kShuffleAlpha1

  convertloop:
    movdqu xmm0, [eax]  // read 4 pixels
    pshufb xmm0, xmm4  // isolate first 2 alphas
    movdqu xmm1, [eax]  // read 4 pixels
    punpcklbw xmm1, xmm1  // first 2 pixel rgbs
    pmulhuw xmm0, xmm1  // rgb * a
    movdqu xmm1, [eax]  // read 4 pixels
    pshufb xmm1, xmm5  // isolate next 2 alphas
    movdqu xmm2, [eax]  // read 4 pixels
    punpckhbw xmm2, xmm2  // next 2 pixel rgbs
    pmulhuw xmm1, xmm2  // rgb * a
    movdqu xmm2, [eax]  // mask original alpha
    lea eax, [eax + 16]
    pand xmm2, xmm3
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    por xmm0, xmm2  // copy original alpha
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_SSSE3
// HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
// Applied after each pixel byte has been duplicated into a word: bytes 6,7
// and 14,15 hold the alpha words of the two pixels in a 128 bit lane. The
// 128u entries zero the word that lines up with the alpha channel.
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Multiplies the B, G and R bytes of each pixel by that pixel's alpha and
// copies the original alpha through unchanged. 8 pixels per iteration.
// dst is addressed as an offset from src so only eax is advanced.
__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
                                             uint8_t* dst_argb,
                                             int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax  // edx = dst_argb - src_argb
    vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu ymm6, [eax]  // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpshufb ymm2, ymm0, ymm4  // low 4 alphas
    vpshufb ymm3, ymm1, ymm4  // high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2  // rgb * a
    vpmulhuw ymm1, ymm1, ymm3  // rgb * a
    vpand ymm6, ymm6, ymm5  // isolate alpha
    vpsrlw ymm0, ymm0, 8
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1  // unmutated.
    vpor ymm0, ymm0, ymm6  // copy original alpha
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Each pixel's alpha byte indexes the fixed_invtbl8 table of 32 bit entries;
// the looked-up words are spread across the pixel's channels and multiplied
// with the byte-duplicated pixel using pmulhuw.
__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov edx, [esp + 12 + 8]  // dst_argb
    mov ecx, [esp + 12 + 12]  // width
    lea ebx, fixed_invtbl8

  convertloop:
    movdqu xmm0, [eax]  // read 4 pixels
    movzx esi, byte ptr [eax + 3]  // first alpha
    movzx edi, byte ptr [eax + 7]  // second alpha
    punpcklbw xmm0, xmm0  // first 2
    movd xmm2, dword ptr [ebx + esi * 4]
    movd xmm3, dword ptr [ebx + edi * 4]
    pshuflw xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
    pshuflw xmm3, xmm3, 040h  // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm0, xmm2  // rgb * a
    movdqu xmm1, [eax]  // read 4 pixels
    movzx esi, byte ptr [eax + 11]  // third alpha
    movzx edi, byte ptr [eax + 15]  // fourth alpha
    punpckhbw xmm1, xmm1  // next 2
    movd xmm2, dword ptr [ebx + esi * 4]
    movd xmm3, dword ptr [ebx + edi * 4]
    pshuflw xmm2, xmm2, 040h  // first 4 inv_alpha words
    pshuflw xmm3, xmm3, 040h  // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm1, xmm2  // rgb * a
    lea eax, [eax + 16]
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}
#endif  // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
// Unattenuate 8 pixels at a time, fetching the fixed_invtbl8 entries for all
// 8 alphas with a single vpgatherdd. dst is addressed as an offset from src.
__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_argb
    mov ecx, [esp + 12]  // width
    sub edx, eax  // edx = dst_argb - src_argb
    vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
    vmovdqu ymm6, [eax]  // read 8 pixels.
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
    vpsrld ymm2, ymm6, 24  // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm4  // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb ymm0, ymm0, ymm1  // unmutated.
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop

    vzeroupper
    ret
  }
}
#else   // USE_GATHER
// Unattenuate 8 pixels at a time. Instead of vpgatherdd, the 8 table entries
// are loaded one at a time with scalar alpha lookups and assembled into ymm3.
// dst is addressed as an offset from src.
__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
                                               uint8_t* dst_argb,
                                               int width) {
  __asm {

    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov edx, [esp + 12 + 8]  // dst_argb
    mov ecx, [esp + 12 + 12]  // width
    sub edx, eax  // edx = dst_argb - src_argb
    lea ebx, fixed_invtbl8
    vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2

  convertloop:
        // replace VPGATHER
    movzx esi, byte ptr [eax + 3]  // alpha0
    movzx edi, byte ptr [eax + 7]  // alpha1
    vmovd xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
    vmovd xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
    movzx esi, byte ptr [eax + 11]  // alpha2
    movzx edi, byte ptr [eax + 15]  // alpha3
    vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
    vmovd xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
    vmovd xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
    movzx esi, byte ptr [eax + 19]  // alpha4
    movzx edi, byte ptr [eax + 23]  // alpha5
    vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
    vmovd xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
    vmovd xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
    movzx esi, byte ptr [eax + 27]  // alpha6
    movzx edi, byte ptr [eax + 31]  // alpha7
    vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
    vmovd xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
    vmovd xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1  // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu ymm6, [eax]  // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm5  // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3  // rgb * ia
    vpackuswb ymm0, ymm0, ymm1  // unmutated.
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    sub ecx, 8
    jg convertloop

    pop edi
    pop esi
    pop ebx
    vzeroupper
    ret
  }
}
#endif  // USE_GATHER
#endif  // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// The gray value is computed with the kARGBToYJ coefficients and written to
// the B, G and R bytes; the original alpha is preserved.
__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, xmmword ptr kARGBToYJ
    movdqa xmm5, xmmword ptr kAddYJ64

  convertloop:
    movdqu xmm0, [eax]  // G
    movdqu xmm1, [eax + 16]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // Add .5 for rounding.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // 8 G bytes
    movdqu xmm2, [eax]  // A
    movdqu xmm3, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm2, 24
    psrld xmm3, 24
    packuswb xmm2, xmm3
    packuswb xmm2, xmm2  // 8 A bytes
    movdqa xmm3, xmm0  // Weave into GG, GA, then GGGA
    punpcklbw xmm0, xmm0  // 8 GG words
    punpcklbw xmm3, xmm2  // 8 GA words
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm3  // GGGA first 4
    punpckhwd xmm1, xmm3  // GGGA next 4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
#endif  // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Per-pixel byte weights, laid out to match the B, G, R, A byte order of an
// ARGB pixel in memory and repeated for the 4 pixels held in one XMM register.
// The alpha weight is 0, so alpha never contributes to a colour channel.
static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                                   17, 68, 35, 0, 17, 68, 35, 0};

static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                                   22, 88, 45, 0, 22, 88, 45, 0};

static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                                   24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
//
// Works in place on dst_argb. Each iteration handles 8 pixels; the loop body
// runs before the width test, so at least 8 pixels are always processed and
// the row is effectively rounded up to a multiple of 8 pixels.
// Naked function: no prologue is generated, arguments are read straight from
// the stack relative to esp.
__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
  __asm {
    mov        eax, [esp + 4] /* dst_argb */
    mov        ecx, [esp + 8] /* width */
    movdqa     xmm2, xmmword ptr kARGBToSepiaB
    movdqa     xmm3, xmmword ptr kARGBToSepiaG
    movdqa     xmm4, xmmword ptr kARGBToSepiaR

 convertloop:
    // pmaddubsw multiplies each pixel byte by its weight and sums adjacent
    // pairs; phaddw then adds the two pair sums, giving one word per pixel.
    movdqu     xmm0, [eax]  // B
    movdqu     xmm6, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm6, xmm2
    phaddw     xmm0, xmm6
    psrlw      xmm0, 7  // weights are scaled by 128
    packuswb   xmm0, xmm0  // 8 B values
    movdqu     xmm5, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm3
    pmaddubsw  xmm1, xmm3
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5  // 8 G values
    punpcklbw  xmm0, xmm5  // 8 BG values
    movdqu     xmm5, [eax]  // R
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm5, xmm4
    pmaddubsw  xmm1, xmm4
    phaddw     xmm5, xmm1
    psrlw      xmm5, 7
    packuswb   xmm5, xmm5  // 8 R values
    movdqu     xmm6, [eax]  // A
    movdqu     xmm1, [eax + 16]
    psrld      xmm6, 24  // keep only the top (alpha) byte of each pixel
    psrld      xmm1, 24
    packuswb   xmm6, xmm1
    packuswb   xmm6, xmm6  // 8 A values
    punpcklbw  xmm5, xmm6  // 8 RA values
    movdqa     xmm1, xmm0  // Weave BG, RA together
    punpcklwd  xmm0, xmm5  // BGRA first 4
    punpckhwd  xmm1, xmm5  // BGRA next 4
    movdqu     [eax], xmm0  // written back over the source pixels
    movdqu     [eax + 16], xmm1
    lea        eax, [eax + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
// Reads 16 signed bytes from matrix_argb: four groups of 4 coefficients, one
// group per output channel in B, G, R, A order. Each group is broadcast to a
// whole register and applied to every source pixel.
// Each iteration handles 8 pixels; the loop body runs before the width test,
// so at least 8 pixels are always processed.
// Naked function: arguments are read straight from the stack relative to esp.
__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
                                                uint8_t* dst_argb,
                                                const int8_t* matrix_argb,
                                                int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_argb */
    mov        ecx, [esp + 12] /* matrix_argb */
    movdqu     xmm5, [ecx]
    pshufd     xmm2, xmm5, 0x00  // coefficients 0..3 (B output) in every lane
    pshufd     xmm3, xmm5, 0x55  // coefficients 4..7 (G output)
    pshufd     xmm4, xmm5, 0xaa  // coefficients 8..11 (R output)
    pshufd     xmm5, xmm5, 0xff  // coefficients 12..15 (A output)
    mov        ecx, [esp + 16] /* width */

 convertloop:
    movdqu     xmm0, [eax]  // B
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm0, xmm2
    pmaddubsw  xmm7, xmm2
    movdqu     xmm6, [eax]  // G
    movdqu     xmm1, [eax + 16]
    pmaddubsw  xmm6, xmm3
    pmaddubsw  xmm1, xmm3
    phaddsw    xmm0, xmm7  // B
    phaddsw    xmm6, xmm1  // G
    psraw      xmm0, 6  // B. Arithmetic shift: sums are signed.
    psraw      xmm6, 6  // G
    packuswb   xmm0, xmm0  // 8 B values
    packuswb   xmm6, xmm6  // 8 G values
    punpcklbw  xmm0, xmm6  // 8 BG values
    movdqu     xmm1, [eax]  // R
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm7, xmm4
    phaddsw    xmm1, xmm7  // R
    movdqu     xmm6, [eax]  // A
    movdqu     xmm7, [eax + 16]
    pmaddubsw  xmm6, xmm5
    pmaddubsw  xmm7, xmm5
    phaddsw    xmm6, xmm7  // A
    psraw      xmm1, 6  // R
    psraw      xmm6, 6  // A
    packuswb   xmm1, xmm1  // 8 R values
    packuswb   xmm6, xmm6  // 8 A values
    punpcklbw  xmm1, xmm6  // 8 RA values
    movdqa     xmm6, xmm0  // Weave BG, RA together
    punpcklwd  xmm0, xmm1  // BGRA first 4
    punpckhwd  xmm6, xmm1  // BGRA next 4
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm6
    lea        eax, [eax + 32]
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Works in place on dst_argb. For the B, G and R channels computes
//   ((v * scale) >> 16) * interval_size + interval_offset
// with 16-bit arithmetic, then saturates to a byte. Alpha is copied through
// unchanged via the 0xff000000 mask.
// Each iteration handles 4 pixels; the loop body runs before the width test,
// so at least 4 pixels are always processed.
// Naked function: arguments are read straight from the stack relative to esp.
__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
                                            int scale,
                                            int interval_size,
                                            int interval_offset,
                                            int width) {
  __asm {
    mov        eax, [esp + 4] /* dst_argb */
    movd       xmm2, [esp + 8] /* scale */
    movd       xmm3, [esp + 12] /* interval_size */
    movd       xmm4, [esp + 16] /* interval_offset */
    mov        ecx, [esp + 20] /* width */
    // pshuflw 040h copies the low 16 bits of the argument into words 0..2
    // (B, G, R) and the upper 16 bits into word 3 (A); pshufd 044h then
    // duplicates that pattern for the second pixel in the register.
    pshuflw    xmm2, xmm2, 040h
    pshufd     xmm2, xmm2, 044h
    pshuflw    xmm3, xmm3, 040h
    pshufd     xmm3, xmm3, 044h
    pshuflw    xmm4, xmm4, 040h
    pshufd     xmm4, xmm4, 044h
    pxor       xmm5, xmm5  // constant 0
    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
    pslld      xmm6, 24

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    punpcklbw  xmm0, xmm5  // first 2 pixels
    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
    movdqu     xmm1, [eax]  // read 4 pixels
    punpckhbw  xmm1, xmm5  // next 2 pixels
    pmulhuw    xmm1, xmm2
    pmullw     xmm0, xmm3  // * interval_size
    movdqu     xmm7, [eax]  // read 4 pixels
    pmullw     xmm1, xmm3
    pand       xmm7, xmm6  // mask alpha
    paddw      xmm0, xmm4  // + interval_offset
    paddw      xmm1, xmm4
    packuswb   xmm0, xmm1
    por        xmm0, xmm7  // merge the original alpha back in
    movdqu     [eax], xmm0
    lea        eax, [eax + 16]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Every byte of each pixel is multiplied by the matching byte of value (both
// widened to 16 bits by byte duplication) and the top 8 bits of the 32-bit
// product are kept.
// Each iteration handles 4 pixels; the loop body runs before the width test,
// so at least 4 pixels are always processed.
__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
                                         uint8_t* dst_argb,
                                         int width,
                                         uint32_t value) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    movd       xmm2, [esp + 16]  // value
    punpcklbw  xmm2, xmm2  // each value byte v becomes the word v * 0x0101
    punpcklqdq xmm2, xmm2  // same 4 words for both pixels in a register

 convertloop:
    movdqu     xmm0, [eax]  // read 4 pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0  // first 2
    punpckhbw  xmm1, xmm1  // next 2
    pmulhuw    xmm0, xmm2  // argb * value
    pmulhuw    xmm1, xmm2  // argb * value
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 punpcklbw xmm0, xmm0 // first 2 punpckhbw xmm1, xmm1 // next 2 punpcklbw xmm2, xmm5 // first 2 punpckhbw xmm3, xmm5 // next 2 pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 jg convertloop pop esi ret } } #endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. 
__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub ecx, 4 jl convertloop49 convertloop4: movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] paddusb xmm0, xmm1 // src_argb + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 jge convertloop4 convertloop49: add ecx, 4 - 1 jl convertloop19 convertloop1: movd xmm0, [eax] // read 1 pixels from src_argb lea eax, [eax + 4] movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] paddusb xmm0, xmm1 // src_argb + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 jge convertloop1 convertloop19: pop esi ret } } #endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] psubusb xmm0, xmm1 // src_argb - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 jg convertloop pop esi ret } } #endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 
__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: vmovdqu ymm1, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] vpunpcklbw ymm0, ymm1, ymm1 // low 4 vpunpckhbw ymm1, ymm1, ymm1 // high 4 vpunpcklbw ymm2, ymm3, ymm5 // low 4 vpunpckhbw ymm3, ymm3, ymm5 // high 4 vpmulhuw ymm0, ymm0, ymm2 // src_argb * src_argb1 low 4 vpmulhuw ymm1, ymm1, ymm3 // src_argb * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 8 jg convertloop pop esi vzeroupper ret } } #endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_argb mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: vmovdqu ymm0, [eax] // read 8 pixels from src_argb lea eax, [eax + 32] vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 8 jg convertloop pop esi vzeroupper ret } } #endif // HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
// Saturating per-byte src_argb - src_argb1. The loop body runs before the
// width test, so at least 8 pixels are always processed.
// Naked function: arguments are addressed off esp after the single push.
__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
                                            const uint8_t* src_argb1,
                                            uint8_t* dst_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width

 convertloop:
    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb
    lea        eax, [eax + 32]
    vpsubusb   ymm0, ymm0, [esi]  // src_argb - src_argb1
    lea        esi, [esi + 32]
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    vzeroupper  // clear upper YMM halves before returning
    ret
  }
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
//
// For each output i the code forms
//   (y0[i] - y0[i+2]) + 2 * (y1[i] - y1[i+2]) + (y2[i] - y2[i+2])
// takes the absolute value and saturates it to a byte. Eight outputs are
// produced per iteration, and each iteration reads 8 bytes starting at
// offset +2, i.e. 2 bytes beyond the 8 outputs it produces.
// src_y1, src_y2 and dst_sobelx are converted to offsets from src_y0 so a
// single pointer (eax) advances through all four rows.
__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      const uint8_t* src_y2,
                                      uint8_t* dst_sobelx,
                                      int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_y0
    mov        esi, [esp + 8 + 8]  // src_y1
    mov        edi, [esp + 8 + 12]  // src_y2
    mov        edx, [esp + 8 + 16]  // dst_sobelx
    mov        ecx, [esp + 8 + 20]  // width
    sub        esi, eax  // esi = src_y1 - src_y0
    sub        edi, eax  // edi = src_y2 - src_y0
    sub        edx, eax  // edx = dst_sobelx - src_y0
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1  // middle row is added twice (weight 2)
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
//
// For each output i the code forms
//   (y0[i] - y1[i]) + 2 * (y0[i+1] - y1[i+1]) + (y0[i+2] - y1[i+2])
// takes the absolute value and saturates it to a byte. Eight outputs are
// produced per iteration, and each iteration reads 8 bytes starting at
// offset +2, i.e. 2 bytes beyond the 8 outputs it produces.
__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
                                      const uint8_t* src_y1,
                                      uint8_t* dst_sobely,
                                      int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_y0
    mov        esi, [esp + 4 + 8]  // src_y1
    mov        edx, [esp + 4 + 12]  // dst_sobely
    mov        ecx, [esp + 4 + 16]  // width
    sub        esi, eax  // esi = src_y1 - src_y0
    sub        edx, eax  // edx = dst_sobely - src_y0
    pxor       xmm5, xmm5  // constant 0

 convertloop:
    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    psubw      xmm0, xmm1
    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm2, xmm5
    psubw      xmm1, xmm2
    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
    punpcklbw  xmm2, xmm5
    punpcklbw  xmm3, xmm5
    psubw      xmm2, xmm3
    paddw      xmm0, xmm2
    paddw      xmm0, xmm1  // centre column is added twice (weight 2)
    paddw      xmm0, xmm1
    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
    psubw      xmm1, xmm0
    pmaxsw     xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [eax + edx], xmm0
    lea        eax, [eax + 8]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255 // R = Sobel // G = Sobel // B = Sobel __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_sobelx mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax pcmpeqb xmm5, xmm5 // alpha 255 pslld xmm5, 24 // 0xff000000 convertloop: movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqa xmm2, xmm0 // GG punpcklbw xmm2, xmm0 // First 8 punpckhbw xmm0, xmm0 // Next 8 movdqa xmm1, xmm2 // GGGG punpcklwd xmm1, xmm2 // First 4 punpckhwd xmm2, xmm2 // Next 4 por xmm1, xmm5 // GGGA por xmm2, xmm5 movdqa xmm3, xmm0 // GGGG punpcklwd xmm3, xmm0 // Next 4 punpckhwd xmm0, xmm0 // Last 4 por xmm3, xmm5 // GGGA por xmm0, xmm5 movdqu [edx], xmm1 movdqu [edx + 16], xmm2 movdqu [edx + 32], xmm3 movdqu [edx + 48], xmm0 lea edx, [edx + 64] sub ecx, 16 jg convertloop pop esi ret } } #endif // HAS_SOBELROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_y, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_sobelx mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax convertloop: movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 jg convertloop pop esi ret } } #endif // HAS_SOBELTOPLANEROW_SSE2 #ifdef HAS_SOBELXYROW_SSE2 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 
// A = 255 // R = Sobel X // G = Sobel // B = Sobel Y __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, const uint8_t* src_sobely, uint8_t* dst_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] // src_sobelx mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax pcmpeqb xmm5, xmm5 // alpha 255 convertloop: movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 paddusb xmm2, xmm1 // sobel = sobelx + sobely movdqa xmm3, xmm0 // XA punpcklbw xmm3, xmm5 punpckhbw xmm0, xmm5 movdqa xmm4, xmm1 // YS punpcklbw xmm4, xmm2 punpckhbw xmm1, xmm2 movdqa xmm6, xmm4 // YSXA punpcklwd xmm6, xmm3 // First 4 punpckhwd xmm4, xmm3 // Next 4 movdqa xmm7, xmm1 // YSXA punpcklwd xmm7, xmm0 // Next 4 punpckhwd xmm1, xmm0 // Last 4 movdqu [edx], xmm6 movdqu [edx + 16], xmm4 movdqu [edx + 32], xmm7 movdqu [edx + 48], xmm1 lea edx, [edx + 64] sub ecx, 16 jg convertloop pop esi ret } } #endif // HAS_SOBELXYROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 // Consider float CumulativeSum. // Consider calling CumulativeSum one row at time as needed. // Consider circular CumulativeSum buffer of radius * 2 + 1 height. // Convert cumulative sum for an area to an average for 1 pixel. // topleft is pointer to top left of CumulativeSum buffer for area. // botleft is pointer to bottom left of CumulativeSum buffer. // width is offset from left to right of area in CumulativeSum buffer measured // in number of ints. // area is the number of pixels in the area being averaged. // dst points to pixel to store result to. // count is number of averaged pixels to produce. // Does 4 pixels at a time. // This function requires alignment on accumulation buffer pointers. 
void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, const int32_t* botleft, int width, int area, uint8_t* dst, int count) { __asm { mov eax, topleft // eax topleft mov esi, botleft // esi botleft mov edx, width movd xmm5, area mov edi, dst mov ecx, count cvtdq2ps xmm5, xmm5 rcpss xmm4, xmm5 // 1.0f / area pshufd xmm4, xmm4, 0 sub ecx, 4 jl l4b cmp area, 128 // 128 pixels will not overflow 15 bits. ja l4 pshufd xmm5, xmm5, 0 // area pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 psrld xmm6, 16 cvtdq2ps xmm6, xmm6 addps xmm5, xmm6 // (65536.0 + area - 1) mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area cvtps2dq xmm5, xmm5 // 0.16 fixed point packssdw xmm5, xmm5 // 16 bit shorts // 4 pixel loop small blocks. s4: // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] psubd xmm1, [eax + edx * 4 + 16] psubd xmm2, [eax + edx * 4 + 32] psubd xmm3, [eax + edx * 4 + 48] lea eax, [eax + 64] // - bottom left psubd xmm0, [esi] psubd xmm1, [esi + 16] psubd xmm2, [esi + 32] psubd xmm3, [esi + 48] // + bottom right paddd xmm0, [esi + edx * 4] paddd xmm1, [esi + edx * 4 + 16] paddd xmm2, [esi + edx * 4 + 32] paddd xmm3, [esi + edx * 4 + 48] lea esi, [esi + 64] packssdw xmm0, xmm1 // pack 4 pixels into 2 registers packssdw xmm2, xmm3 pmulhuw xmm0, xmm5 pmulhuw xmm2, xmm5 packuswb xmm0, xmm2 movdqu [edi], xmm0 lea edi, [edi + 16] sub ecx, 4 jge s4 jmp l4b // 4 pixel loop l4: // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] // - top right psubd xmm0, [eax + edx * 4] psubd xmm1, [eax + edx * 4 + 16] psubd xmm2, [eax + edx * 4 + 32] psubd xmm3, [eax + edx * 4 + 48] lea eax, [eax + 64] // - bottom left psubd xmm0, [esi] psubd xmm1, [esi + 16] psubd xmm2, [esi + 32] psubd xmm3, [esi + 48] // + bottom right paddd xmm0, [esi + edx * 4] paddd xmm1, [esi + edx * 4 + 16] paddd xmm2, [esi + edx * 4 + 32] paddd xmm3, [esi + edx * 4 + 
48] lea esi, [esi + 64] cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area cvtdq2ps xmm1, xmm1 mulps xmm0, xmm4 mulps xmm1, xmm4 cvtdq2ps xmm2, xmm2 cvtdq2ps xmm3, xmm3 mulps xmm2, xmm4 mulps xmm3, xmm4 cvtps2dq xmm0, xmm0 cvtps2dq xmm1, xmm1 cvtps2dq xmm2, xmm2 cvtps2dq xmm3, xmm3 packssdw xmm0, xmm1 packssdw xmm2, xmm3 packuswb xmm0, xmm2 movdqu [edi], xmm0 lea edi, [edi + 16] sub ecx, 4 jge l4 l4b: add ecx, 4 - 1 jl l1b // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] lea eax, [eax + 16] psubd xmm0, [esi] paddd xmm0, [esi + edx * 4] lea esi, [esi + 16] cvtdq2ps xmm0, xmm0 mulps xmm0, xmm4 cvtps2dq xmm0, xmm0 packssdw xmm0, xmm0 packuswb xmm0, xmm0 movd dword ptr [edi], xmm0 lea edi, [edi + 4] sub ecx, 1 jge l1 l1b: } } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value. void ComputeCumulativeSumRow_SSE2(const uint8_t* row, int32_t* cumsum, const int32_t* previous_cumsum, int width) { __asm { mov eax, row mov edx, cumsum mov esi, previous_cumsum mov ecx, width pxor xmm0, xmm0 pxor xmm1, xmm1 sub ecx, 4 jl l4b test edx, 15 jne l4b // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] movdqa xmm4, xmm2 punpcklbw xmm2, xmm1 movdqa xmm3, xmm2 punpcklwd xmm2, xmm1 punpckhwd xmm3, xmm1 punpckhbw xmm4, xmm1 movdqa xmm5, xmm4 punpcklwd xmm4, xmm1 punpckhwd xmm5, xmm1 paddd xmm0, xmm2 movdqu xmm2, [esi] // previous row above. 
paddd xmm2, xmm0 paddd xmm0, xmm3 movdqu xmm3, [esi + 16] paddd xmm3, xmm0 paddd xmm0, xmm4 movdqu xmm4, [esi + 32] paddd xmm4, xmm0 paddd xmm0, xmm5 movdqu xmm5, [esi + 48] lea esi, [esi + 64] paddd xmm5, xmm0 movdqu [edx], xmm2 movdqu [edx + 16], xmm3 movdqu [edx + 32], xmm4 movdqu [edx + 48], xmm5 lea edx, [edx + 64] sub ecx, 4 jge l4 l4b: add ecx, 4 - 1 jl l1b // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel lea eax, [eax + 4] punpcklbw xmm2, xmm1 punpcklwd xmm2, xmm1 paddd xmm0, xmm2 movdqu xmm2, [esi] lea esi, [esi + 16] paddd xmm2, xmm0 movdqu [edx], xmm2 lea edx, [edx + 16] sub ecx, 1 jge l1 l1b: } } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, int src_argb_stride, uint8_t* dst_argb, const float* uv_dudv, int width) { __asm { push esi push edi mov eax, [esp + 12] // src_argb mov esi, [esp + 16] // stride mov edx, [esp + 20] // dst_argb mov ecx, [esp + 24] // pointer to uv_dudv movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width shl esi, 16 // 4, stride add esi, 4 movd xmm5, esi sub ecx, 4 jl l4b // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 addps xmm0, xmm7 movlhps xmm2, xmm0 movdqa xmm4, xmm7 addps xmm4, xmm4 // dudv *= 2 movdqa xmm3, xmm2 // x2, y2, x3, y3 addps xmm3, xmm4 addps xmm4, xmm4 // dudv *= 4 // 4 pixel loop l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 packssdw xmm0, xmm1 // x, y as 8 shorts pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 punpckldq xmm1, xmm6 // combine pixel 0 and 1 addps xmm2, xmm4 // x, y += dx, dy first 2 movq qword ptr [edx], xmm1 movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 movd xmm6, [eax + esi] // read pixel 2 movd xmm0, [eax + edi] // read pixel 3 punpckldq xmm6, xmm0 // combine pixel 2 and 3 addps xmm3, xmm4 // x, y += dx, dy next 2 movq qword ptr 8[edx], xmm6 lea edx, [edx + 16] sub ecx, 4 jge l4 l4b: add ecx, 4 - 1 jl l1b // 1 pixel loop l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride addps xmm2, xmm7 // x, y += dx, dy movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 jge l1 l1b: pop edi pop esi ret } } #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { __asm { push esi push edi mov edi, [esp + 8 + 4] // dst_ptr mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax add eax, 256 vmovd xmm5, eax // low fraction 256..1 vpunpcklbw xmm5, xmm5, xmm0 vpunpcklwd xmm5, xmm5, xmm5 vbroadcastss ymm5, xmm5 mov eax, 0x80808080 // 128b for bias and rounding. 
vmovd xmm4, eax vbroadcastss ymm4, xmm4 xloop: vmovdqu ymm0, [esi] vmovdqu ymm2, [esi + edx] vpunpckhbw ymm1, ymm0, ymm2 // mutates vpunpcklbw ymm0, ymm0, ymm2 vpsubb ymm1, ymm1, ymm4 // bias to signed image vpsubb ymm0, ymm0, ymm4 vpmaddubsw ymm1, ymm5, ymm1 vpmaddubsw ymm0, ymm5, ymm0 vpaddw ymm1, ymm1, ymm4 // unbias and round vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 jg xloop jmp xloop99 // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 jg xloop50 jmp xloop99 // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb xloop99: pop edi pop esi vzeroupper ret } } #endif // HAS_INTERPOLATEROW_AVX2 // Bilinear filter 16x2 -> 16x1 // TODO(fbarchard): Consider allowing 256 using memcpy. __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { __asm { push esi push edi mov edi, [esp + 8 + 4] // dst_ptr mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. movd xmm0, eax // high fraction 0..255 neg eax add eax, 256 movd xmm5, eax // low fraction 255..1 punpcklbw xmm5, xmm0 punpcklwd xmm5, xmm5 pshufd xmm5, xmm5, 0 mov eax, 0x80808080 // 128 for biasing image to signed. 
movd xmm4, eax pshufd xmm4, xmm4, 0x00 xloop: movdqu xmm0, [esi] movdqu xmm2, [esi + edx] movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 pmaddubsw xmm2, xmm0 pmaddubsw xmm3, xmm1 paddw xmm2, xmm4 paddw xmm3, xmm4 psrlw xmm2, 8 psrlw xmm3, 8 packuswb xmm2, xmm3 movdqu [esi + edi], xmm2 lea esi, [esi + 16] sub ecx, 16 jg xloop jmp xloop99 // Blend 50 / 50. xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] pavgb xmm0, xmm1 movdqu [esi + edi], xmm0 lea esi, [esi + 16] sub ecx, 16 jg xloop50 jmp xloop99 // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] movdqu [esi + edi], xmm0 lea esi, [esi + 16] sub ecx, 16 jg xloop100 xloop99: pop edi pop esi ret } } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] mov ecx, [esp + 16] // width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] pshufb xmm0, xmm5 pshufb xmm1, xmm5 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg wloop ret } } #ifdef HAS_ARGBSHUFFLEROW_AVX2 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { __asm { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. mov ecx, [esp + 16] // width wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] vpshufb ymm0, ymm0, ymm5 vpshufb ymm1, ymm1, ymm5 vmovdqu [edx], ymm0 vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 16 jg wloop vzeroupper ret } } #endif // HAS_ARGBSHUFFLEROW_AVX2 // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
// UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_frame, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_y mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 punpcklbw xmm0, xmm2 // YUYV punpckhbw xmm1, xmm2 movdqu [edi], xmm0 movdqu [edi + 16], xmm1 lea edi, [edi + 32] sub ecx, 16 jg convertloop pop edi pop esi ret } } __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_frame, int width) { __asm { push esi push edi mov eax, [esp + 8 + 4] // src_y mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 lea edi, [edi + 32] sub ecx, 16 jg convertloop pop edi pop esi ret } } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { __asm { push esi mov eax, [esp + 4 + 4] /* src_argb */ mov edx, [esp + 4 + 8] /* dst_argb */ mov esi, [esp + 4 + 12] /* poly */ mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. // 2 pixel loop. 
convertloop: // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 movdqa xmm4, xmm0 punpcklwd xmm0, xmm3 // pixel 0 punpckhwd xmm4, xmm3 // pixel 1 cvtdq2ps xmm0, xmm0 // 4 floats cvtdq2ps xmm4, xmm4 movdqa xmm1, xmm0 // X movdqa xmm5, xmm4 mulps xmm0, [esi + 16] // C1 * X mulps xmm4, [esi + 16] addps xmm0, [esi] // result = C0 + C1 * X addps xmm4, [esi] movdqa xmm2, xmm1 movdqa xmm6, xmm5 mulps xmm2, xmm1 // X * X mulps xmm6, xmm5 mulps xmm1, xmm2 // X * X * X mulps xmm5, xmm6 mulps xmm2, [esi + 32] // C2 * X * X mulps xmm6, [esi + 32] mulps xmm1, [esi + 48] // C3 * X * X * X mulps xmm5, [esi + 48] addps xmm0, xmm2 // result += C2 * X * X addps xmm4, xmm6 addps xmm0, xmm1 // result += C3 * X * X * X addps xmm4, xmm5 cvttps2dq xmm0, xmm0 cvttps2dq xmm4, xmm4 packuswb xmm0, xmm4 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] sub ecx, 2 jg convertloop pop esi ret } } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { __asm { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* poly */ vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 vbroadcastf128 ymm6, [ecx + 32] // C2 vbroadcastf128 ymm7, [ecx + 48] // C3 mov ecx, [esp + 16] /* width */ // 2 pixel loop. 
convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels lea eax, [eax + 8] vcvtdq2ps ymm0, ymm0 // X 8 floats vmulps ymm2, ymm0, ymm0 // X * X vmulps ymm3, ymm0, ymm7 // C3 * X vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X vcvttps2dq ymm0, ymm0 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 vmovq qword ptr [edx], xmm0 lea edx, [edx + 8] sub ecx, 2 jg convertloop vzeroupper ret } } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 #ifdef HAS_HALFFLOATROW_SSE2 static float kExpBias = 1.9259299444e-34f; __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, uint16_t* dst, float scale, int width) { __asm { mov eax, [esp + 4] /* src */ mov edx, [esp + 8] /* dst */ movd xmm4, dword ptr [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ mulss xmm4, kExpBias pshufd xmm4, xmm4, 0 pxor xmm5, xmm5 sub edx, eax // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts add eax, 16 movdqa xmm3, xmm2 punpcklwd xmm2, xmm5 cvtdq2ps xmm2, xmm2 // convert 8 ints to floats punpckhwd xmm3, xmm5 cvtdq2ps xmm3, xmm3 mulps xmm2, xmm4 mulps xmm3, xmm4 psrld xmm2, 13 psrld xmm3, 13 packssdw xmm2, xmm3 movdqu [eax + edx - 16], xmm2 sub ecx, 8 jg convertloop ret } } #endif // HAS_HALFFLOATROW_SSE2 #ifdef HAS_HALFFLOATROW_AVX2 __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, uint16_t* dst, float scale, int width) { __asm { mov eax, [esp + 4] /* src */ mov edx, [esp + 8] /* dst */ movd xmm4, dword ptr [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ vmulss xmm4, xmm4, kExpBias vbroadcastss ymm4, xmm4 vpxor ymm5, ymm5, ymm5 sub edx, eax // 16 pixel loop. 
convertloop: vmovdqu ymm2, [eax] // 16 shorts add eax, 32 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints vpunpcklwd ymm2, ymm2, ymm5 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats vcvtdq2ps ymm2, ymm2 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. vmulps ymm2, ymm2, ymm4 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate vpsrld ymm2, ymm2, 13 vpackssdw ymm2, ymm2, ymm3 vmovdqu [eax + edx - 32], ymm2 sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_HALFFLOATROW_AVX2 #ifdef HAS_HALFFLOATROW_F16C __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, uint16_t* dst, float scale, int width) { __asm { mov eax, [esp + 4] /* src */ mov edx, [esp + 8] /* dst */ vbroadcastss ymm4, [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ sub edx, eax // 16 pixel loop. convertloop: vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts add eax, 32 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats vcvtdq2ps ymm3, ymm3 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 vmulps ymm3, ymm3, ymm4 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate vcvtps2ph xmm3, ymm3, 3 vmovdqu [eax + edx + 32], xmm2 vmovdqu [eax + edx + 32 + 16], xmm3 sub ecx, 16 jg convertloop vzeroupper ret } } #endif // HAS_HALFFLOATROW_F16C #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, const uint8_t* table_argb, int width) { __asm { push esi mov eax, [esp + 4 + 4] /* dst_argb */ mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. 
convertloop:
    // Each channel byte is replaced in place by table[value * 4 + channel].
    // eax is advanced first, so the current pixel is addressed as [eax - 4].
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
//
// Works in place on dst_argb, one 4 byte pixel per iteration. Bytes 0, 1 and
// 2 of each pixel are replaced by table_argb[value * 4 + byte_index]; byte 3
// is neither read nor written.
//
//   dst_argb   - pixels to transform in place.
//   table_argb - lookup table indexed by value * 4 + byte_index.
//   width      - number of pixels.
__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
                                            const uint8_t* table_argb,
                                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4] /* dst_argb */
    mov        esi, [esp + 4 + 8] /* table_argb */
    mov        ecx, [esp + 4 + 12] /* width */

    // 1 pixel loop.
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]  // advance; current pixel is now [eax - 4].
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
                                                   uint8_t* dst_argb,
                                                   int width,
                                                   const uint8_t* luma,
                                                   uint32_t lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4] /* src_argb */
    mov        edi, [esp + 8 + 8] /* dst_argb */
    mov        ecx, [esp + 8 + 12] /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0  // replicate the table pointer to all 4 lanes.
    pshufd     xmm3, xmm3, 0  // replicate lumacoeff to all 4 lanes.
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
convertloop: movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 phaddw xmm0, xmm0 pand xmm0, xmm4 // mask out low bits punpcklwd xmm0, xmm5 paddd xmm0, xmm2 // add table base movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 movzx edx, byte ptr [eax] movzx edx, byte ptr [esi + edx] mov byte ptr [edi], dl movzx edx, byte ptr [eax + 1] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 1], dl movzx edx, byte ptr [eax + 2] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 2], dl movzx edx, byte ptr [eax + 3] // copy alpha. mov byte ptr [edi + 3], dl movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 movzx edx, byte ptr [eax + 4] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 4], dl movzx edx, byte ptr [eax + 5] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 5], dl movzx edx, byte ptr [eax + 6] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 6], dl movzx edx, byte ptr [eax + 7] // copy alpha. mov byte ptr [edi + 7], dl movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 movzx edx, byte ptr [eax + 8] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 8], dl movzx edx, byte ptr [eax + 9] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 9], dl movzx edx, byte ptr [eax + 10] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 10], dl movzx edx, byte ptr [eax + 11] // copy alpha. mov byte ptr [edi + 11], dl movd esi, xmm0 movzx edx, byte ptr [eax + 12] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 12], dl movzx edx, byte ptr [eax + 13] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 13], dl movzx edx, byte ptr [eax + 14] movzx edx, byte ptr [esi + edx] mov byte ptr [edi + 14], dl movzx edx, byte ptr [eax + 15] // copy alpha. 
mov byte ptr [edi + 15], dl lea eax, [eax + 16] lea edi, [edi + 16] sub ecx, 4 jg convertloop pop edi pop esi ret } } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 #endif // defined(_M_X64) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) libyuv-0.0~git20220104.b91df1a/source/scale.cc000066400000000000000000002350651416500237200204370ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/scale.h" #include #include #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyPlane #include "libyuv/row.h" #include "libyuv/scale_row.h" #include "libyuv/scale_uv.h" // For UVScale #ifdef __cplusplus namespace libyuv { extern "C" { #endif static __inline int Abs(int v) { return v >= 0 ? v : -v; } #define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) // Scale plane, 1/2 // This is an optimized version for scaling down a plane to 1/2 of // its original size. static void ScalePlaneDown2(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) = filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); int row_stride = src_stride * 2; (void)src_width; (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. 
src_stride = 0; } #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : ScaleRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : ScaleRowDown2Box_NEON); } } #endif #if defined(HAS_SCALEROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 : ScaleRowDown2Box_Any_SSSE3); if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 : ScaleRowDown2Box_SSSE3); } } #endif #if defined(HAS_SCALEROWDOWN2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : ScaleRowDown2Box_Any_AVX2); if (IS_ALIGNED(dst_width, 32)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : ScaleRowDown2Box_AVX2); } } #endif #if defined(HAS_SCALEROWDOWN2_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_MMI : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MMI : ScaleRowDown2Box_Any_MMI); if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MMI : (filtering == kFilterLinear ? ScaleRowDown2Linear_MMI : ScaleRowDown2Box_MMI); } } #endif #if defined(HAS_SCALEROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_MSA : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA : ScaleRowDown2Box_Any_MSA); if (IS_ALIGNED(dst_width, 32)) { ScaleRowDown2 = filtering == kFilterNone ? 
ScaleRowDown2_MSA : (filtering == kFilterLinear ? ScaleRowDown2Linear_MSA : ScaleRowDown2Box_MSA); } } #endif if (filtering == kFilterLinear) { src_stride = 0; } // TODO(fbarchard): Loop through source height to allow odd height. for (y = 0; y < dst_height; ++y) { ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += row_stride; dst_ptr += dst_stride; } } static void ScalePlaneDown2_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_ptr, uint16_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) = filtering == kFilterNone ? ScaleRowDown2_16_C : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : ScaleRowDown2Box_16_C); int row_stride = src_stride * 2; (void)src_width; (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; } #if defined(HAS_SCALEROWDOWN2_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : ScaleRowDown2Box_16_SSE2); } #endif #if defined(HAS_SCALEROWDOWN2_16_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_MMI : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_MMI : ScaleRowDown2Box_16_MMI); } #endif if (filtering == kFilterLinear) { src_stride = 0; } // TODO(fbarchard): Loop through source height to allow odd height. 
for (y = 0; y < dst_height; ++y) { ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += row_stride; dst_ptr += dst_stride; } } // Scale plane, 1/4 // This is an optimized version for scaling down a plane to 1/4 of // its original size. static void ScalePlaneDown4(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; int row_stride = src_stride * 4; (void)src_width; (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } } #endif #if defined(HAS_SCALEROWDOWN4_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } } #endif #if defined(HAS_SCALEROWDOWN4_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; } } #endif #if defined(HAS_SCALEROWDOWN4_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_Any_MMI : ScaleRowDown4_Any_MMI; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_MMI : ScaleRowDown4_MMI; } } #endif #if defined(HAS_SCALEROWDOWN4_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; } } #endif if (filtering == kFilterLinear) { src_stride = 0; } for (y = 0; y < dst_height; ++y) { ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += row_stride; dst_ptr += dst_stride; } } static void ScalePlaneDown4_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_ptr, uint16_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; int row_stride = src_stride * 4; (void)src_width; (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN4_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif #if defined(HAS_SCALEROWDOWN4_16_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_16_MMI : ScaleRowDown4_16_MMI; } #endif if (filtering == kFilterLinear) { src_stride = 0; } for (y = 0; y < dst_height; ++y) { ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); src_ptr += row_stride; dst_ptr += dst_stride; } } // Scale plane down, 3/4 static void ScalePlaneDown34(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; (void)src_width; (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_C; ScaleRowDown34_1 = ScaleRowDown34_C; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; } #if defined(HAS_SCALEROWDOWN34_NEON) if (TestCpuFlag(kCpuHasNEON)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; } if (dst_width % 24 == 0) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_NEON; ScaleRowDown34_1 = ScaleRowDown34_NEON; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; } } } #endif #if defined(HAS_SCALEROWDOWN34_MMI) if (TestCpuFlag(kCpuHasMMI)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_Any_MMI; ScaleRowDown34_1 = ScaleRowDown34_Any_MMI; if (dst_width % 24 == 0) { ScaleRowDown34_0 = ScaleRowDown34_MMI; ScaleRowDown34_1 = ScaleRowDown34_MMI; } } } #endif #if defined(HAS_SCALEROWDOWN34_MSA) if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; } 
else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; } if (dst_width % 48 == 0) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_MSA; ScaleRowDown34_1 = ScaleRowDown34_MSA; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; } } } #endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; } if (dst_width % 24 == 0) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_SSSE3; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; } } } #endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); } else if ((dst_height % 3) == 1) { ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); } } static void ScalePlaneDown34_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_ptr, uint16_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width); void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* 
dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; (void)src_width; (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_C; ScaleRowDown34_1 = ScaleRowDown34_16_C; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C; ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C; } #if defined(HAS_SCALEROWDOWN34_16_NEON) if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_NEON; ScaleRowDown34_1 = ScaleRowDown34_16_NEON; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON; ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON; } } #endif #if defined(HAS_SCALEROWDOWN34_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3; } else { ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3; } } #endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); } else if ((dst_height % 3) == 1) { ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); } } // Scale plane, 3/8 // This is an optimized version for scaling down a plane to 3/8 // of its original size. 
// // Uses box filter arranges like this // aaabbbcc -> abc // aaabbbcc def // aaabbbcc ghi // dddeeeff // dddeeeff // dddeeeff // ggghhhii // ggghhhii // Boxes are 3x3, 2x3, 3x2 and 2x2 static void ScalePlaneDown38(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; assert(dst_width % 3 == 0); (void)src_width; (void)src_height; if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; } #if defined(HAS_SCALEROWDOWN38_NEON) if (TestCpuFlag(kCpuHasNEON)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; } if (dst_width % 12 == 0) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_NEON; ScaleRowDown38_2 = ScaleRowDown38_NEON; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; } } } #endif #if defined(HAS_SCALEROWDOWN38_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; } if (dst_width % 12 == 0 && !filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; } if (dst_width % 6 == 0 && filtering) { ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } 
} #endif #if defined(HAS_SCALEROWDOWN38_MSA) if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; } if (dst_width % 12 == 0) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_MSA; ScaleRowDown38_2 = ScaleRowDown38_MSA; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; } } } #endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); } else if ((dst_height % 3) == 1) { ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); } } static void ScalePlaneDown38_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_ptr, uint16_t* dst_ptr, enum FilterMode filtering) { int y; void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width); void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; (void)src_width; (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_C; ScaleRowDown38_2 = ScaleRowDown38_16_C; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C; ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C; } #if defined(HAS_SCALEROWDOWN38_16_NEON) if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_NEON; ScaleRowDown38_2 = ScaleRowDown38_16_NEON; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON; ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON; } } #endif #if defined(HAS_SCALEROWDOWN38_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3; } else { ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3; } } #endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } // Remainder 1 or 2 rows with last row vertically unfiltered if ((dst_height % 3) == 2) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 3; dst_ptr += dst_stride; ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); } else if ((dst_height % 3) == 1) { ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); } } #define MIN1(x) ((x) < 1 ? 
1 : (x))

// Returns the sum of iboxwidth consecutive uint16_t values starting at
// src_ptr. iboxwidth must be positive.
static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
  uint32_t sum = 0u;
  int x;
  assert(iboxwidth > 0);
  for (x = 0; x < iboxwidth; ++x) {
    sum += src_ptr[x];
  }
  return sum;
}

// Same as SumPixels, but for a row of uint32_t values.
static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
  uint32_t sum = 0u;
  int x;
  assert(iboxwidth > 0);
  for (x = 0; x < iboxwidth; ++x) {
    sum += src_ptr[x];
  }
  return sum;
}

// Writes dst_width output pixels, each the sum of a box of src_ptr entries
// scaled by a reciprocal of (box width * boxheight).
// x and dx are stepped as fixed point; x >> 16 is the integer source column.
// The box width of each output is the number of whole columns the step
// crossed (at least 1); scaletbl holds the reciprocals for the two widths
// that are indexed, minboxwidth and minboxwidth + 1.
static void ScaleAddCols2_C(int dst_width,
                            int boxheight,
                            int x,
                            int dx,
                            const uint16_t* src_ptr,
                            uint8_t* dst_ptr) {
  int i;
  int scaletbl[2];
  int minboxwidth = dx >> 16;
  int boxwidth;
  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
  for (i = 0; i < dst_width; ++i) {
    int ix = x >> 16;
    x += dx;
    boxwidth = MIN1((x >> 16) - ix);
    // The multiply binds tighter than the shift: (sum * scale) >> 16.
    *dst_ptr++ =
        SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
        16;
  }
}

// Variant of ScaleAddCols2_C that reads uint32_t sums and writes uint16_t
// pixels.
static void ScaleAddCols2_16_C(int dst_width,
                               int boxheight,
                               int x,
                               int dx,
                               const uint32_t* src_ptr,
                               uint16_t* dst_ptr) {
  int i;
  int scaletbl[2];
  int minboxwidth = dx >> 16;
  int boxwidth;
  scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight);
  scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight);
  for (i = 0; i < dst_width; ++i) {
    int ix = x >> 16;
    x += dx;
    boxwidth = MIN1((x >> 16) - ix);
    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
                     scaletbl[boxwidth - minboxwidth] >>
                 16;
  }
}

// Column pass for a box that is one column wide: every output is the
// corresponding src_ptr entry scaled by 65536 / boxheight. dx is unused;
// the first source column is x >> 16.
static void ScaleAddCols0_C(int dst_width,
                            int boxheight,
                            int x,
                            int dx,
                            const uint16_t* src_ptr,
                            uint8_t* dst_ptr) {
  int scaleval = 65536 / boxheight;
  int i;
  (void)dx;
  src_ptr += (x >> 16);
  for (i = 0; i < dst_width; ++i) {
    *dst_ptr++ = src_ptr[i] * scaleval >> 16;
  }
}

// Column pass for a constant box width of MIN1(dx >> 16) columns.
// x is reduced to an integer column once, then advanced by boxwidth per
// output pixel.
static void ScaleAddCols1_C(int dst_width,
                            int boxheight,
                            int x,
                            int dx,
                            const uint16_t* src_ptr,
                            uint8_t* dst_ptr) {
  int boxwidth = MIN1(dx >> 16);
  int scaleval = 65536 / (boxwidth * boxheight);
  int i;
  x >>= 16;
  for (i = 0; i < dst_width; ++i) {
    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
    x += boxwidth;
  }
}

static void
ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, const uint32_t* src_ptr, uint16_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; for (i = 0; i < dst_width; ++i) { *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16; x += boxwidth; } } // Scale plane down to any dimensions, with interpolation. // (boxfilter). // // Same method as SimpleScale, which is fixed point, outputting // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. static void ScalePlaneBox(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; const int max_y = (src_height << 16); ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, &dx, &dy); src_width = Abs(src_width); { // Allocate a row buffer of uint16_t. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint16_t* src_ptr, uint8_t* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_C : ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C);
    // Row stage, called once per source row of the current box. The buffer
    // is zeroed before each box, so this presumably accumulates into it.
    // Starts as the C version; CPU-specific variants are selected below.
    void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
                        int src_width) = ScaleAddRow_C;
#if defined(HAS_SCALEADDROW_SSE2)
    if (TestCpuFlag(kCpuHasSSE2)) {
      ScaleAddRow = ScaleAddRow_Any_SSE2;
      if (IS_ALIGNED(src_width, 16)) {
        ScaleAddRow = ScaleAddRow_SSE2;
      }
    }
#endif
#if defined(HAS_SCALEADDROW_AVX2)
    if (TestCpuFlag(kCpuHasAVX2)) {
      ScaleAddRow = ScaleAddRow_Any_AVX2;
      if (IS_ALIGNED(src_width, 32)) {
        ScaleAddRow = ScaleAddRow_AVX2;
      }
    }
#endif
#if defined(HAS_SCALEADDROW_NEON)
    if (TestCpuFlag(kCpuHasNEON)) {
      ScaleAddRow = ScaleAddRow_Any_NEON;
      if (IS_ALIGNED(src_width, 16)) {
        ScaleAddRow = ScaleAddRow_NEON;
      }
    }
#endif
#if defined(HAS_SCALEADDROW_MMI)
    if (TestCpuFlag(kCpuHasMMI)) {
      ScaleAddRow = ScaleAddRow_Any_MMI;
      if (IS_ALIGNED(src_width, 8)) {
        ScaleAddRow = ScaleAddRow_MMI;
      }
    }
#endif
#if defined(HAS_SCALEADDROW_MSA)
    if (TestCpuFlag(kCpuHasMSA)) {
      ScaleAddRow = ScaleAddRow_Any_MSA;
      if (IS_ALIGNED(src_width, 16)) {
        ScaleAddRow = ScaleAddRow_MSA;
      }
    }
#endif

    for (j = 0; j < dst_height; ++j) {
      int boxheight;
      int iy = y >> 16;
      const uint8_t* src = src_ptr + iy * (int64_t)src_stride;
      // Advance y by one output row, clamped to the bottom of the source;
      // the box covers the source rows between the old and new position.
      y += dy;
      if (y > max_y) {
        y = max_y;
      }
      boxheight = MIN1((y >> 16) - iy);
      memset(row16, 0, src_width * 2);
      for (k = 0; k < boxheight; ++k) {
        ScaleAddRow(src, (uint16_t*)(row16), src_width);
        src += src_stride;
      }
      ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
      dst_ptr += dst_stride;
    }
    free_aligned_buffer_64(row16);
  }
}

// 16-bit sample variant of ScalePlaneBox; column sums are kept in uint32_t.
static void ScalePlaneBox_16(int src_width, int src_height, int dst_width,
                             int dst_height, int src_stride, int dst_stride,
                             const uint16_t* src_ptr, uint16_t* dst_ptr) {
  int j, k;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
  int y = 0;
  int dx = 0;
  int dy = 0;
  const int max_y = (src_height << 16);
  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
             &dx, &dy);
  src_width = Abs(src_width);
  {
    // Allocate a row buffer of uint32_t.
align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint32_t* src_ptr, uint16_t* dst_ptr) = (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width) = ScaleAddRow_16_C; #if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { ScaleAddRow = ScaleAddRow_16_SSE2; } #endif #if defined(HAS_SCALEADDROW_16_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(src_width, 4)) { ScaleAddRow = ScaleAddRow_16_MMI; } #endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; const uint16_t* src = src_ptr + iy * (int64_t)src_stride; y += dy; if (y > max_y) { y = max_y; } boxheight = MIN1((y >> 16) - iy); memset(row32, 0, src_width * 4); for (k = 0; k < boxheight; ++k) { ScaleAddRow(src, (uint32_t*)(row32), src_width); src += src_stride; } ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row32); } } // Scale plane down with bilinear interpolation. void ScalePlaneBilinearDown(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row buffer. align_buffer_64(row, src_width); const int max_y = (src_height - 1) << 16; int j; void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif #if defined(HAS_SCALEFILTERCOLS_NEON) if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleFilterCols = ScaleFilterCols_NEON; } } #endif #if defined(HAS_SCALEFILTERCOLS_MSA) if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_Any_MSA; if (IS_ALIGNED(dst_width, 16)) { ScaleFilterCols = ScaleFilterCols_MSA; } } #endif if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; const uint8_t* src = src_ptr + yi * (int64_t)src_stride; if (filtering == 
kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; InterpolateRow(row, src, src_stride, src_width, yf); ScaleFilterCols(dst_ptr, row, dst_width, x, dx); } dst_ptr += dst_stride; y += dy; if (y > max_y) { y = max_y; } } free_aligned_buffer_64(row); } void ScalePlaneBilinearDown_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_ptr, uint16_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row buffer. align_buffer_64(row, src_width * 2); const int max_y = (src_height - 1) << 16; int j; void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C; void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(src_width, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_16_AVX2; if (IS_ALIGNED(src_width, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_16_NEON; if (IS_ALIGNED(src_width, 16)) { 
InterpolateRow = InterpolateRow_16_NEON;
    }
  }
#endif

#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
  }
#endif
  if (y > max_y) {
    y = max_y;
  }

  for (j = 0; j < dst_height; ++j) {
    int yi = y >> 16;
    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;
    if (filtering == kFilterLinear) {
      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
    } else {
      // Bits 8..15 of y are the 8-bit vertical blend fraction.
      int yf = (y >> 8) & 255;
      InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
      ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
    }
    dst_ptr += dst_stride;
    y += dy;
    if (y > max_y) {
      y = max_y;
    }
  }
  free_aligned_buffer_64(row);
}

// Scale plane up with bilinear interpolation.
// Source rows are first scaled horizontally into two temporary rows of
// dst_width pixels; each output row is then produced from those two rows
// by InterpolateRow.
void ScalePlaneBilinearUp(int src_width, int src_height, int dst_width,
                          int dst_height, int src_stride, int dst_stride,
                          const uint8_t* src_ptr, uint8_t* dst_ptr,
                          enum FilterMode filtering) {
  int j;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
  int y = 0;
  int dx = 0;
  int dy = 0;
  const int max_y = (src_height - 1) << 16;
  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_C;
  // With no filtering the column stage is plain point sampling.
  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          int dst_width, int x, int dx) =
      filtering ?
ScaleFilterCols_C : ScaleCols_C;
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
             &dx, &dy);
  src_width = Abs(src_width);

  // InterpolateRow works on rows of dst_width pixels here, so alignment is
  // tested against dst_width.
#if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
    if (IS_ALIGNED(dst_width, 16)) {
      InterpolateRow = InterpolateRow_SSSE3;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
    if (IS_ALIGNED(dst_width, 32)) {
      InterpolateRow = InterpolateRow_AVX2;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_NEON;
    if (IS_ALIGNED(dst_width, 16)) {
      InterpolateRow = InterpolateRow_NEON;
    }
  }
#endif

  if (filtering && src_width >= 32768) {
    ScaleFilterCols = ScaleFilterCols64_C;
  }
#if defined(HAS_SCALEFILTERCOLS_SSSE3)
  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEFILTERCOLS_NEON)
  if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleFilterCols_NEON;
    }
  }
#endif
#if defined(HAS_SCALEFILTERCOLS_MSA)
  if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_Any_MSA;
    if (IS_ALIGNED(dst_width, 16)) {
      ScaleFilterCols = ScaleFilterCols_MSA;
    }
  }
#endif
  // Unfiltered exact 2x horizontal upscale gets a dedicated column routine.
  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
    ScaleFilterCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleColsUp2_SSE2;
    }
#endif
#if defined(HAS_SCALECOLS_MMI)
    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleColsUp2_MMI;
    }
#endif
  }

  if (y > max_y) {
    y = max_y;
  }
  {
    int yi = y >> 16;
    const uint8_t* src = src_ptr + yi * (int64_t)src_stride;

    // Allocate 2 row buffers.
const int kRowSize = (dst_width + 31) & ~31;
    align_buffer_64(row, kRowSize * 2);

    uint8_t* rowptr = row;
    int rowstride = kRowSize;
    int lasty = yi;

    // Prime both buffers with the first two horizontally scaled source rows
    // (the same row twice when the source has a single row).
    ScaleFilterCols(rowptr, src, dst_width, x, dx);
    if (src_height > 1) {
      src += src_stride;
    }
    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
    src += src_stride;

    for (j = 0; j < dst_height; ++j) {
      yi = y >> 16;
      if (yi != lasty) {
        if (y > max_y) {
          y = max_y;
          yi = y >> 16;
          src = src_ptr + yi * (int64_t)src_stride;
        }
        if (yi != lasty) {
          // Moved to a new source row: overwrite the older buffer with the
          // next scaled row, then flip rowptr/rowstride so the two buffers
          // swap roles.
          ScaleFilterCols(rowptr, src, dst_width, x, dx);
          rowptr += rowstride;
          rowstride = -rowstride;
          lasty = yi;
          src += src_stride;
        }
      }
      if (filtering == kFilterLinear) {
        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
      } else {
        int yf = (y >> 8) & 255;
        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
      }
      dst_ptr += dst_stride;
      y += dy;
    }
    free_aligned_buffer_64(row);
  }
}

// Scale plane, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
// This is used to scale U and V planes of I422 to I444.
void ScalePlaneUp2_Linear(int src_width, int src_height, int dst_width,
                          int dst_height, int src_stride, int dst_stride,
                          const uint8_t* src_ptr, uint8_t* dst_ptr) {
  void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
      ScaleRowUp2_Linear_Any_C;
  int i;
  int y;
  int dy;

  // This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
  }
#endif

  if (dst_height == 1) {
    // Single output row: sample the middle source row.
    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
               dst_width);
  } else {
    // Step through source rows in 16.16 fixed point so the first and last
    // output rows map to the first and last source rows; y starts just
    // under one half.
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
      dst_ptr += dst_stride;
      y += dy;
    }
  }
}

// Scale plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// This is used to scale U and V planes of I420 to I444.
void ScalePlaneUp2_Bilinear(int src_width, int src_height, int dst_width,
                            int dst_height, int src_stride, int dst_stride,
                            const uint8_t* src_ptr, uint8_t* dst_ptr) {
  void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                      uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
      ScaleRowUp2_Bilinear_Any_C;
  int x;

  // This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
  assert(src_height == ((dst_height + 1) / 2));

#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
  }
#endif

  // First output row: called with both strides 0, so only the first source
  // row and the first destination row are addressed.
  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  dst_ptr += dst_stride;
  // Each pair of adjacent source rows yields two output rows.
  for (x = 0; x < src_height - 1; ++x) {
    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
    src_ptr += src_stride;
    // TODO(fbarchard): Test performance of writing one row of destination at a
    // time.
    dst_ptr += 2 * dst_stride;
  }
  // An even dst_height leaves one final row, produced from the last source
  // row alone.
  if (!(dst_height & 1)) {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  }
}

// Scale at most 14 bit plane, horizontally up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
// stride is in count of uint16_t.
// This is used to scale U and V planes of I210 to I410 and I212 to I412.
void ScalePlaneUp2_12_Linear(int src_width, int src_height, int dst_width,
                             int dst_height, int src_stride, int dst_stride,
                             const uint16_t* src_ptr, uint16_t* dst_ptr) {
  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                     int dst_width) = ScaleRowUp2_Linear_16_Any_C;
  int i;
  int y;
  int dy;

  // This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));

#ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleRowUp = ScaleRowUp2_Linear_12_Any_SSSE3;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_12_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    ScaleRowUp = ScaleRowUp2_Linear_12_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_12_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleRowUp = ScaleRowUp2_Linear_12_Any_NEON;
  }
#endif

  if (dst_height == 1) {
    // Single output row: sample the middle source row.
    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
               dst_width);
  } else {
    // 16.16 fixed point row stepping; first and last output rows map to the
    // first and last source rows.
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
      dst_ptr += dst_stride;
      y += dy;
    }
  }
}

// Scale at most 12 bit plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// stride is in count of uint16_t.
// This is used to scale U and V planes of I010 to I410 and I012 to I412.
void ScalePlaneUp2_12_Bilinear(int src_width, int src_height, int dst_width,
                               int dst_height, int src_stride, int dst_stride,
                               const uint16_t* src_ptr, uint16_t* dst_ptr) {
  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
      ScaleRowUp2_Bilinear_16_Any_C;
  int x;

  // This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
  assert(src_height == ((dst_height + 1) / 2));

#ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_SSSE3;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_12_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_12_Any_NEON;
  }
#endif

  // First output row, called with both strides 0.
  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  dst_ptr += dst_stride;
  // Each pair of adjacent source rows yields two output rows.
  for (x = 0; x < src_height - 1; ++x) {
    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
    src_ptr += src_stride;
    dst_ptr += 2 * dst_stride;
  }
  // An even dst_height leaves one final row from the last source row.
  if (!(dst_height & 1)) {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  }
}

// 16-bit sample variant of the 2x horizontal linear upscale.
// Strides are used as element (uint16_t) offsets.
void ScalePlaneUp2_16_Linear(int src_width, int src_height, int dst_width,
                             int dst_height, int src_stride, int dst_stride,
                             const uint16_t* src_ptr, uint16_t* dst_ptr) {
  void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
                     int dst_width) = ScaleRowUp2_Linear_16_Any_C;
  int i;
  int y;
  int dy;

  // This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));

#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
  }
#endif

  if (dst_height == 1) {
    // Single output row: sample the middle source row.
    ScaleRowUp(src_ptr + ((src_height - 1) / 2) * (int64_t)src_stride, dst_ptr,
               dst_width);
  } else {
    // 16.16 fixed point row stepping; first and last output rows map to the
    // first and last source rows.
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
      ScaleRowUp(src_ptr + (y >> 16) * (int64_t)src_stride, dst_ptr, dst_width);
      dst_ptr += dst_stride;
      y += dy;
    }
  }
}

// 16-bit sample variant of the 2x bilinear upscale.
// Strides are used as element (uint16_t) offsets.
void ScalePlaneUp2_16_Bilinear(int src_width, int src_height, int dst_width,
                               int dst_height, int src_stride, int dst_stride,
                               const uint16_t* src_ptr, uint16_t* dst_ptr) {
  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
      ScaleRowUp2_Bilinear_16_Any_C;
  int x;

  // This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
  assert(src_height == ((dst_height + 1) / 2));

  // NOTE(review): the guard below is named ..._16_SSE2 but the runtime check
  // and the selected routine are SSSE3, unlike the matching SSE2 trio in
  // ScalePlaneUp2_16_Linear. Confirm against the row-function declarations
  // whether this pairing is intended.
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
  }
#endif

  // First output row, called with both strides 0.
  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  dst_ptr += dst_stride;
  // Each pair of adjacent source rows yields two output rows.
  for (x = 0; x < src_height - 1; ++x) {
    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
    src_ptr += src_stride;
    dst_ptr += 2 * dst_stride;
  }
  // An even dst_height leaves one final row from the last source row.
  if (!(dst_height & 1)) {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  }
}

// 16-bit sample variant of ScalePlaneBilinearUp: source rows are scaled
// horizontally into two temporary rows, and each output row is produced from
// those two rows by InterpolateRow.
void ScalePlaneBilinearUp_16(int src_width, int src_height, int dst_width,
                             int dst_height, int src_stride, int dst_stride,
                             const uint16_t* src_ptr, uint16_t* dst_ptr,
                             enum FilterMode filtering) {
  int j;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
  int y = 0;
  int dx = 0;
  int dy = 0;
  const int max_y = (src_height - 1) << 16;
  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_16_C;
  // With no filtering the column stage is plain point sampling.
  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
                          int dst_width, int x, int dx) =
      filtering ?
ScaleFilterCols_16_C : ScaleCols_16_C;
  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
             &dx, &dy);
  src_width = Abs(src_width);

#if defined(HAS_INTERPOLATEROW_16_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    InterpolateRow = InterpolateRow_Any_16_SSE2;
    if (IS_ALIGNED(dst_width, 16)) {
      InterpolateRow = InterpolateRow_16_SSE2;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_16_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_16_SSSE3;
    if (IS_ALIGNED(dst_width, 16)) {
      InterpolateRow = InterpolateRow_16_SSSE3;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_16_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_16_AVX2;
    if (IS_ALIGNED(dst_width, 32)) {
      InterpolateRow = InterpolateRow_16_AVX2;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_16_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_16_NEON;
    if (IS_ALIGNED(dst_width, 16)) {
      InterpolateRow = InterpolateRow_16_NEON;
    }
  }
#endif

  if (filtering && src_width >= 32768) {
    ScaleFilterCols = ScaleFilterCols64_16_C;
  }
#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
  }
#endif
  // Unfiltered exact 2x horizontal upscale gets a dedicated column routine.
  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
    ScaleFilterCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleColsUp2_16_SSE2;
    }
#endif
#if defined(HAS_SCALECOLS_16_MMI)
    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
      ScaleFilterCols = ScaleColsUp2_16_MMI;
    }
#endif
  }

  if (y > max_y) {
    y = max_y;
  }
  {
    int yi = y >> 16;
    const uint16_t* src = src_ptr + yi * (int64_t)src_stride;

    // Allocate 2 row buffers.
const int kRowSize = (dst_width + 31) & ~31;
    // kRowSize counts uint16_t elements; two rows of 2 bytes each.
    align_buffer_64(row, kRowSize * 4);

    uint16_t* rowptr = (uint16_t*)row;
    int rowstride = kRowSize;
    int lasty = yi;

    // Prime both buffers with the first two horizontally scaled source rows
    // (the same row twice when the source has a single row).
    ScaleFilterCols(rowptr, src, dst_width, x, dx);
    if (src_height > 1) {
      src += src_stride;
    }
    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
    src += src_stride;

    for (j = 0; j < dst_height; ++j) {
      yi = y >> 16;
      if (yi != lasty) {
        if (y > max_y) {
          y = max_y;
          yi = y >> 16;
          src = src_ptr + yi * (int64_t)src_stride;
        }
        if (yi != lasty) {
          // Moved to a new source row: overwrite the older buffer, then flip
          // rowptr/rowstride so the two buffers swap roles.
          ScaleFilterCols(rowptr, src, dst_width, x, dx);
          rowptr += rowstride;
          rowstride = -rowstride;
          lasty = yi;
          src += src_stride;
        }
      }
      if (filtering == kFilterLinear) {
        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
      } else {
        int yf = (y >> 8) & 255;
        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
      }
      dst_ptr += dst_stride;
      y += dy;
    }
    free_aligned_buffer_64(row);
  }
}

// Scale Plane to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
static void ScalePlaneSimple(int src_width, int src_height, int dst_width,
                             int dst_height, int src_stride, int dst_stride,
                             const uint8_t* src_ptr, uint8_t* dst_ptr) {
  int i;
  void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
                    int x, int dx) = ScaleCols_C;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
int x = 0;
  int y = 0;
  int dx = 0;
  int dy = 0;
  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
             &dx, &dy);
  src_width = Abs(src_width);

  // Exact 2x horizontal upscale gets a dedicated column routine.
  if (src_width * 2 == dst_width && x < 0x8000) {
    ScaleCols = ScaleColsUp2_C;
#if defined(HAS_SCALECOLS_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleCols = ScaleColsUp2_SSE2;
    }
#endif
#if defined(HAS_SCALECOLS_MMI)
    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
      ScaleCols = ScaleColsUp2_MMI;
    }
#endif
  }

  // One source row per output row, selected by the integer part of y.
  for (i = 0; i < dst_height; ++i) {
    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
              dx);
    dst_ptr += dst_stride;
    y += dy;
  }
}

// 16-bit sample variant of ScalePlaneSimple.
static void ScalePlaneSimple_16(int src_width, int src_height, int dst_width,
                                int dst_height, int src_stride, int dst_stride,
                                const uint16_t* src_ptr, uint16_t* dst_ptr) {
  int i;
  void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
                    int x, int dx) = ScaleCols_16_C;
  // Initial source x/y coordinate and step values as 16.16 fixed point.
  int x = 0;
  int y = 0;
  int dx = 0;
  int dy = 0;
  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
             &dx, &dy);
  src_width = Abs(src_width);

  // Exact 2x horizontal upscale gets a dedicated column routine.
  if (src_width * 2 == dst_width && x < 0x8000) {
    ScaleCols = ScaleColsUp2_16_C;
#if defined(HAS_SCALECOLS_16_SSE2)
    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
      ScaleCols = ScaleColsUp2_16_SSE2;
    }
#endif
#if defined(HAS_SCALECOLS_16_MMI)
    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 8)) {
      ScaleCols = ScaleColsUp2_16_MMI;
    }
#endif
  }

  // One source row per output row, selected by the integer part of y.
  for (i = 0; i < dst_height; ++i) {
    ScaleCols(dst_ptr, src_ptr + (y >> 16) * (int64_t)src_stride, dst_width, x,
              dx);
    dst_ptr += dst_stride;
    y += dy;
  }
}

// Scale a plane.
// This function dispatches to a specialized scaler based on scale factor.
LIBYUV_API
void ScalePlane(const uint8_t* src, int src_stride, int src_width,
                int src_height, uint8_t* dst, int dst_stride, int dst_width,
                int dst_height, enum FilterMode filtering) {
  // Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);

  // Negative height means invert the image.
  // Start from the last row and walk upwards with a negated stride.
  if (src_height < 0) {
    src_height = -src_height;
    src = src + (src_height - 1) * (int64_t)src_stride;
    src_stride = -src_stride;
  }

  // Use specialized scales to improve performance for common resolutions.
  // For example, all the 1/2 scalings will use ScalePlaneDown2()
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
    return;
  }
  if (dst_width == src_width && filtering != kFilterBox) {
    int dy = FixedDiv(src_height, dst_height);
    // Arbitrary scale vertically, but unscaled horizontally.
    ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
                       dst_stride, src, dst, 0, 0, dy, 1, filtering);
    return;
  }
  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
    // Scale down.
    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
      // optimized, 3/4
      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
                       dst_stride, src, dst, filtering);
      return;
    }
    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // optimized, 1/2
      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
                      dst_stride, src, dst, filtering);
      return;
    }
    // 3/8 rounded up for odd sized chroma height.
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
      // optimized, 3/8
      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
                       dst_stride, src, dst, filtering);
      return;
    }
    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
        (filtering == kFilterBox || filtering == kFilterNone)) {
      // optimized, 1/4
      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
                      dst_stride, src, dst, filtering);
      return;
    }
  }
  // General paths, tried in order: box filter for more than 2x vertical
  // reduction, the dedicated 2x upscalers, bilinear up, bilinear down, and
  // finally unfiltered point sampling.
  if (filtering == kFilterBox && dst_height * 2 < src_height) {
    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
                  dst_stride, src, dst);
    return;
  }
  if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
    ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
                         src_stride, dst_stride, src, dst);
    return;
  }
  if ((dst_height + 1) / 2 == src_height &&
      (dst_width + 1) / 2 == src_width &&
      (filtering == kFilterBilinear || filtering == kFilterBox)) {
    ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst);
    return;
  }
  if (filtering && dst_height > src_height) {
    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
                         src_stride, dst_stride, src, dst, filtering);
    return;
  }
  if (filtering) {
    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
    return;
  }
  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
                   dst_stride, src, dst);
}

// 16-bit sample variant of ScalePlane; same dispatch order, calling the
// _16 scalers.
LIBYUV_API
void ScalePlane_16(const uint16_t* src, int src_stride, int src_width,
                   int src_height, uint16_t* dst, int dst_stride,
                   int dst_width, int dst_height, enum FilterMode filtering) {
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);

  // Negative height means invert the image.
if (src_height < 0) {
    src_height = -src_height;
    src = src + (src_height - 1) * (int64_t)src_stride;
    src_stride = -src_stride;
  }

  // Use specialized scales to improve performance for common resolutions.
  // For example, all the 1/2 scalings will use ScalePlaneDown2()
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
    return;
  }
  if (dst_width == src_width && filtering != kFilterBox) {
    int dy = FixedDiv(src_height, dst_height);
    // Arbitrary scale vertically, but unscaled horizontally.
    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
                          dst_stride, src, dst, 0, 0, dy, 1, filtering);
    return;
  }
  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
    // Scale down.
    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
      // optimized, 3/4
      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
                          src_stride, dst_stride, src, dst, filtering);
      return;
    }
    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // optimized, 1/2
      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
                         src_stride, dst_stride, src, dst, filtering);
      return;
    }
    // 3/8 rounded up for odd sized chroma height.
if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
      // optimized, 3/8
      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
                          src_stride, dst_stride, src, dst, filtering);
      return;
    }
    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
        (filtering == kFilterBox || filtering == kFilterNone)) {
      // optimized, 1/4
      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
                         src_stride, dst_stride, src, dst, filtering);
      return;
    }
  }
  // General paths, tried in order: box filter for more than 2x vertical
  // reduction, the dedicated 2x upscalers, bilinear up, bilinear down, and
  // finally unfiltered point sampling.
  if (filtering == kFilterBox && dst_height * 2 < src_height) {
    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
                     dst_stride, src, dst);
    return;
  }
  if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
    ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
                            src_stride, dst_stride, src, dst);
    return;
  }
  if ((dst_height + 1) / 2 == src_height &&
      (dst_width + 1) / 2 == src_width &&
      (filtering == kFilterBilinear || filtering == kFilterBox)) {
    ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
                              src_stride, dst_stride, src, dst);
    return;
  }
  if (filtering && dst_height > src_height) {
    ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
                            src_stride, dst_stride, src, dst, filtering);
    return;
  }
  if (filtering) {
    ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
                              src_stride, dst_stride, src, dst, filtering);
    return;
  }
  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
                      dst_stride, src, dst);
}

// Scale a plane of 16-bit samples, preferring the _12 2x upscalers when the
// dimensions and filter match; every other case is forwarded to
// ScalePlane_16.
LIBYUV_API
void ScalePlane_12(const uint16_t* src, int src_stride, int src_width,
                   int src_height, uint16_t* dst, int dst_stride,
                   int dst_width, int dst_height, enum FilterMode filtering) {
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);

  // Negative height means invert the image.
if (src_height < 0) { src_height = -src_height; src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) { ScalePlaneUp2_12_Linear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScalePlaneUp2_12_Bilinear(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst); return; } ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride, dst_width, dst_height, filtering); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. LIBYUV_API int I420Scale(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_width, int src_height, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, filtering); ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } LIBYUV_API int I420Scale_16(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* 
src_v, int src_stride_v, int src_width, int src_height, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, filtering); ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } LIBYUV_API int I420Scale_12(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, int src_width, int src_height, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); ScalePlane_12(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, filtering); ScalePlane_12(src_v, 
src_stride_v, src_halfwidth, src_halfheight, dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } // Scale an I444 image. // This function in turn calls a scaling function for each plane. LIBYUV_API int I444Scale(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, int src_width, int src_height, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_u, int dst_stride_u, uint8_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering) { if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, dst_width, dst_height, filtering); ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, dst_width, dst_height, filtering); return 0; } LIBYUV_API int I444Scale_16(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, int src_width, int src_height, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering) { if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, dst_width, dst_height, filtering); ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, dst_width, dst_height, filtering); return 0; } LIBYUV_API int 
I444Scale_12(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, const uint16_t* src_v, int src_stride_v, int src_width, int src_height, uint16_t* dst_y, int dst_stride_y, uint16_t* dst_u, int dst_stride_u, uint16_t* dst_v, int dst_stride_v, int dst_width, int dst_height, enum FilterMode filtering) { if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane_12(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); ScalePlane_12(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, dst_width, dst_height, filtering); ScalePlane_12(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, dst_width, dst_height, filtering); return 0; } // Scale an NV12 image. // This function in turn calls a scaling function for each plane. LIBYUV_API int NV12Scale(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, int src_stride_uv, int src_width, int src_height, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_uv, int dst_stride_uv, int dst_width, int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || !dst_uv || dst_width <= 0 || dst_height <= 0) { return -1; } ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, dst_width, dst_height, filtering); UVScale(src_uv, src_stride_uv, src_halfwidth, src_halfheight, dst_uv, dst_stride_uv, dst_halfwidth, dst_halfheight, filtering); return 0; } // Deprecated api LIBYUV_API int Scale(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, int src_stride_y, int 
src_stride_u, int src_stride_v, int src_width, int src_height, uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v, int dst_stride_y, int dst_stride_u, int dst_stride_v, int dst_width, int dst_height, LIBYUV_BOOL interpolate) { return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, src_width, src_height, dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, dst_height, interpolate ? kFilterBox : kFilterNone); } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_any.cc000066400000000000000000000746771416500237200213200ustar00rootroot00000000000000/* * Copyright 2015 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include // For memset/memcpy #include "libyuv/scale.h" #include "libyuv/scale_row.h" #include "libyuv/basic_types.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Fixed scale down. // Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ int dst_width) { \ int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ int n = dst_width - r; \ if (n > 0) { \ SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ } \ SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ dst_ptr + n * BPP, r); \ } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. 
#define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
               int dst_width) {                                                \
    /* Everything except the final output pixel may go through SIMD. */        \
    const int scaled = dst_width - 1;                                          \
    /* MASK + 1 is the SIMD step; MOD is used because it may not be 2^k. */    \
    const int tail = (int)((unsigned int)scaled % (MASK + 1)); /* NOLINT */    \
    const int simd_width = scaled - tail;                                      \
    if (simd_width > 0) {                                                      \
      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, simd_width);             \
    }                                                                          \
    /* The C kernel finishes the remainder plus the one held-back pixel. */    \
    SCALEROWDOWN_C(src_ptr + (simd_width * FACTOR) * BPP, src_stride,          \
                   dst_ptr + simd_width * BPP, tail + 1);                      \
  }

// Instantiations. Arguments are: wrapper name, SIMD kernel, C kernel,
// FACTOR (source pixels per destination pixel), BPP (bytes per pixel) and
// MASK (SIMD step in destination pixels, minus one).

// 1/2 scale down.
#ifdef HAS_SCALEROWDOWN2_SSSE3
SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_SSSE3,
      ScaleRowDown2Linear_SSSE3,
      ScaleRowDown2Linear_C,
      2,
      1,
      15)
SDANY(ScaleRowDown2Box_Any_SSSE3,
      ScaleRowDown2Box_SSSE3,
      ScaleRowDown2Box_C,
      2,
      1,
      15)
SDODD(ScaleRowDown2Box_Odd_SSSE3,
      ScaleRowDown2Box_SSSE3,
      ScaleRowDown2Box_Odd_C,
      2,
      1,
      15)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3
SDANY(ScaleUVRowDown2Box_Any_SSSE3,
      ScaleUVRowDown2Box_SSSE3,
      ScaleUVRowDown2Box_C,
      2,
      2,
      3)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_AVX2
SDANY(ScaleUVRowDown2Box_Any_AVX2,
      ScaleUVRowDown2Box_AVX2,
      ScaleUVRowDown2Box_C,
      2,
      2,
      7)
#endif
#ifdef HAS_SCALEROWDOWN2_AVX2
SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_AVX2,
      ScaleRowDown2Linear_AVX2,
      ScaleRowDown2Linear_C,
      2,
      1,
      31)
SDANY(ScaleRowDown2Box_Any_AVX2,
      ScaleRowDown2Box_AVX2,
      ScaleRowDown2Box_C,
      2,
      1,
      31)
SDODD(ScaleRowDown2Box_Odd_AVX2,
      ScaleRowDown2Box_AVX2,
      ScaleRowDown2Box_Odd_C,
      2,
      1,
      31)
#endif
#ifdef HAS_SCALEROWDOWN2_NEON
SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
SDANY(ScaleRowDown2Linear_Any_NEON,
      ScaleRowDown2Linear_NEON,
      ScaleRowDown2Linear_C,
      2,
      1,
      15)
SDANY(ScaleRowDown2Box_Any_NEON,
      ScaleRowDown2Box_NEON,
      ScaleRowDown2Box_C,
      2,
      1,
      15)
SDODD(ScaleRowDown2Box_Odd_NEON,
      ScaleRowDown2Box_NEON,
      ScaleRowDown2Box_Odd_C,
      2,
      1,
      15)
#endif
#ifdef HAS_SCALEUVROWDOWN2BOX_NEON
SDANY(ScaleUVRowDown2Box_Any_NEON,
      ScaleUVRowDown2Box_NEON,
      ScaleUVRowDown2Box_C,
      2,
      2,
      7)
#endif
#ifdef HAS_SCALEROWDOWN2_MSA
SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
SDANY(ScaleRowDown2Linear_Any_MSA,
      ScaleRowDown2Linear_MSA,
      ScaleRowDown2Linear_C,
      2,
      1,
      31)
SDANY(ScaleRowDown2Box_Any_MSA,
      ScaleRowDown2Box_MSA,
      ScaleRowDown2Box_C,
      2,
      1,
      31)
#endif
#ifdef HAS_SCALEROWDOWN2_MMI
SDANY(ScaleRowDown2_Any_MMI, ScaleRowDown2_MMI, ScaleRowDown2_C, 2, 1, 7)
SDANY(ScaleRowDown2Linear_Any_MMI,
      ScaleRowDown2Linear_MMI,
      ScaleRowDown2Linear_C,
      2,
      1,
      7)
SDANY(ScaleRowDown2Box_Any_MMI,
      ScaleRowDown2Box_MMI,
      ScaleRowDown2Box_C,
      2,
      1,
      7)
SDODD(ScaleRowDown2Box_Odd_MMI,
      ScaleRowDown2Box_MMI,
      ScaleRowDown2Box_Odd_C,
      2,
      1,
      7)
#endif

// 1/4 scale down.
#ifdef HAS_SCALEROWDOWN4_SSSE3
SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_SSSE3,
      ScaleRowDown4Box_SSSE3,
      ScaleRowDown4Box_C,
      4,
      1,
      7)
#endif
#ifdef HAS_SCALEROWDOWN4_AVX2
SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_AVX2,
      ScaleRowDown4Box_AVX2,
      ScaleRowDown4Box_C,
      4,
      1,
      15)
#endif
#ifdef HAS_SCALEROWDOWN4_NEON
SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_NEON,
      ScaleRowDown4Box_NEON,
      ScaleRowDown4Box_C,
      4,
      1,
      7)
#endif
#ifdef HAS_SCALEROWDOWN4_MSA
SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
SDANY(ScaleRowDown4Box_Any_MSA,
      ScaleRowDown4Box_MSA,
      ScaleRowDown4Box_C,
      4,
      1,
      15)
#endif
#ifdef HAS_SCALEROWDOWN4_MMI
SDANY(ScaleRowDown4_Any_MMI, ScaleRowDown4_MMI, ScaleRowDown4_C, 4, 1, 7)
SDANY(ScaleRowDown4Box_Any_MMI,
      ScaleRowDown4Box_MMI,
      ScaleRowDown4Box_C,
      4,
      1,
      7)
#endif

// 3/4 scale down. FACTOR is passed as the token sequence "4 / 3".
#ifdef HAS_SCALEROWDOWN34_SSSE3
SDANY(ScaleRowDown34_Any_SSSE3,
      ScaleRowDown34_SSSE3,
      ScaleRowDown34_C,
      4 / 3,
      1,
      23)
SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
      ScaleRowDown34_0_Box_SSSE3,
      ScaleRowDown34_0_Box_C,
      4 / 3,
      1,
      23)
SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
      ScaleRowDown34_1_Box_SSSE3,
      ScaleRowDown34_1_Box_C,
      4 / 3,
      1,
      23)
#endif
// 3/4 scale down, remaining targets. FACTOR is the token sequence "4 / 3".
#ifdef HAS_SCALEROWDOWN34_NEON
SDANY(ScaleRowDown34_Any_NEON,
      ScaleRowDown34_NEON,
      ScaleRowDown34_C,
      4 / 3,
      1,
      23)
SDANY(ScaleRowDown34_0_Box_Any_NEON,
      ScaleRowDown34_0_Box_NEON,
      ScaleRowDown34_0_Box_C,
      4 / 3,
      1,
      23)
SDANY(ScaleRowDown34_1_Box_Any_NEON,
      ScaleRowDown34_1_Box_NEON,
      ScaleRowDown34_1_Box_C,
      4 / 3,
      1,
      23)
#endif
#ifdef HAS_SCALEROWDOWN34_MSA
SDANY(ScaleRowDown34_Any_MSA,
      ScaleRowDown34_MSA,
      ScaleRowDown34_C,
      4 / 3,
      1,
      47)
SDANY(ScaleRowDown34_0_Box_Any_MSA,
      ScaleRowDown34_0_Box_MSA,
      ScaleRowDown34_0_Box_C,
      4 / 3,
      1,
      47)
SDANY(ScaleRowDown34_1_Box_Any_MSA,
      ScaleRowDown34_1_Box_MSA,
      ScaleRowDown34_1_Box_C,
      4 / 3,
      1,
      47)
#endif
#ifdef HAS_SCALEROWDOWN34_MMI
SDANY(ScaleRowDown34_Any_MMI,
      ScaleRowDown34_MMI,
      ScaleRowDown34_C,
      4 / 3,
      1,
      23)
#endif

// 3/8 scale down. FACTOR is the token sequence "8 / 3".
#ifdef HAS_SCALEROWDOWN38_SSSE3
SDANY(ScaleRowDown38_Any_SSSE3,
      ScaleRowDown38_SSSE3,
      ScaleRowDown38_C,
      8 / 3,
      1,
      11)
SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
      ScaleRowDown38_3_Box_SSSE3,
      ScaleRowDown38_3_Box_C,
      8 / 3,
      1,
      5)
SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
      ScaleRowDown38_2_Box_SSSE3,
      ScaleRowDown38_2_Box_C,
      8 / 3,
      1,
      5)
#endif
#ifdef HAS_SCALEROWDOWN38_NEON
SDANY(ScaleRowDown38_Any_NEON,
      ScaleRowDown38_NEON,
      ScaleRowDown38_C,
      8 / 3,
      1,
      11)
SDANY(ScaleRowDown38_3_Box_Any_NEON,
      ScaleRowDown38_3_Box_NEON,
      ScaleRowDown38_3_Box_C,
      8 / 3,
      1,
      11)
SDANY(ScaleRowDown38_2_Box_Any_NEON,
      ScaleRowDown38_2_Box_NEON,
      ScaleRowDown38_2_Box_C,
      8 / 3,
      1,
      11)
#endif
#ifdef HAS_SCALEROWDOWN38_MSA
SDANY(ScaleRowDown38_Any_MSA,
      ScaleRowDown38_MSA,
      ScaleRowDown38_C,
      8 / 3,
      1,
      11)
SDANY(ScaleRowDown38_3_Box_Any_MSA,
      ScaleRowDown38_3_Box_MSA,
      ScaleRowDown38_3_Box_C,
      8 / 3,
      1,
      11)
SDANY(ScaleRowDown38_2_Box_Any_MSA,
      ScaleRowDown38_2_Box_MSA,
      ScaleRowDown38_2_Box_C,
      8 / 3,
      1,
      11)
#endif

// ARGB 1/2 scale down (BPP argument is 4).
#ifdef HAS_SCALEARGBROWDOWN2_SSE2
SDANY(ScaleARGBRowDown2_Any_SSE2,
      ScaleARGBRowDown2_SSE2,
      ScaleARGBRowDown2_C,
      2,
      4,
      3)
SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
      ScaleARGBRowDown2Linear_SSE2,
      ScaleARGBRowDown2Linear_C,
      2,
      4,
      3)
SDANY(ScaleARGBRowDown2Box_Any_SSE2,
      ScaleARGBRowDown2Box_SSE2,
      ScaleARGBRowDown2Box_C,
      2,
      4,
      3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_NEON
SDANY(ScaleARGBRowDown2_Any_NEON,
      ScaleARGBRowDown2_NEON,
      ScaleARGBRowDown2_C,
      2,
      4,
      7)
SDANY(ScaleARGBRowDown2Linear_Any_NEON,
      ScaleARGBRowDown2Linear_NEON,
      ScaleARGBRowDown2Linear_C,
      2,
      4,
      7)
SDANY(ScaleARGBRowDown2Box_Any_NEON,
      ScaleARGBRowDown2Box_NEON,
      ScaleARGBRowDown2Box_C,
      2,
      4,
      7)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_MSA
SDANY(ScaleARGBRowDown2_Any_MSA,
      ScaleARGBRowDown2_MSA,
      ScaleARGBRowDown2_C,
      2,
      4,
      3)
SDANY(ScaleARGBRowDown2Linear_Any_MSA,
      ScaleARGBRowDown2Linear_MSA,
      ScaleARGBRowDown2Linear_C,
      2,
      4,
      3)
SDANY(ScaleARGBRowDown2Box_Any_MSA,
      ScaleARGBRowDown2Box_MSA,
      ScaleARGBRowDown2Box_C,
      2,
      4,
      3)
#endif
#ifdef HAS_SCALEARGBROWDOWN2_MMI
SDANY(ScaleARGBRowDown2_Any_MMI,
      ScaleARGBRowDown2_MMI,
      ScaleARGBRowDown2_C,
      2,
      4,
      1)
SDANY(ScaleARGBRowDown2Linear_Any_MMI,
      ScaleARGBRowDown2Linear_MMI,
      ScaleARGBRowDown2Linear_C,
      2,
      4,
      1)
SDANY(ScaleARGBRowDown2Box_Any_MMI,
      ScaleARGBRowDown2Box_MMI,
      ScaleARGBRowDown2Box_C,
      2,
      4,
      1)
#endif
// Last use of the fixed-factor wrapper generator.
#undef SDANY

// Scale down by even scale factor.
#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ uint8_t* dst_ptr, int dst_width) { \ int r = dst_width & MASK; \ int n = dst_width & ~MASK; \ if (n > 0) { \ SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ } \ SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ dst_ptr + n * BPP, r); \ } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, ScaleARGBRowDownEven_C, 4, 3) SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, ScaleARGBRowDownEvenBox_C, 4, 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, ScaleARGBRowDownEven_C, 4, 3) SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, ScaleARGBRowDownEvenBox_C, 4, 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_MSA SDAANY(ScaleARGBRowDownEven_Any_MSA, ScaleARGBRowDownEven_MSA, ScaleARGBRowDownEven_C, 4, 3) SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, ScaleARGBRowDownEvenBox_MSA, ScaleARGBRowDownEvenBox_C, 4, 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_MMI SDAANY(ScaleARGBRowDownEven_Any_MMI, ScaleARGBRowDownEven_MMI, ScaleARGBRowDownEven_C, 4, 1) SDAANY(ScaleARGBRowDownEvenBox_Any_MMI, ScaleARGBRowDownEvenBox_MMI, ScaleARGBRowDownEvenBox_C, 4, 1) #endif #ifdef HAS_SCALEUVROWDOWNEVEN_NEON SDAANY(ScaleUVRowDownEven_Any_NEON, ScaleUVRowDownEven_NEON, ScaleUVRowDownEven_C, 2, 3) #endif #ifdef SASIMDONLY // This also works and uses memcpy and SIMD instead of C, but is slower on ARM // Add rows box filter scale down. 
Using macro from row_any #define SAROW(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int width) { \ SIMD_ALIGNED(uint16_t dst_temp[32]); \ SIMD_ALIGNED(uint8_t src_temp[32]); \ memset(dst_temp, 0, 32 * 2); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ ANY_SIMD(src_ptr, dst_ptr, n); \ } \ memcpy(src_temp, src_ptr + n * SBPP, r * SBPP); \ memcpy(dst_temp, dst_ptr + n * BPP, r * BPP); \ ANY_SIMD(src_temp, dst_temp, MASK + 1); \ memcpy(dst_ptr + n * BPP, dst_temp, r * BPP); \ } #ifdef HAS_SCALEADDROW_SSE2 SAROW(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, 1, 2, 15) #endif #ifdef HAS_SCALEADDROW_AVX2 SAROW(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, 1, 2, 31) #endif #ifdef HAS_SCALEADDROW_NEON SAROW(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, 1, 2, 15) #endif #ifdef HAS_SCALEADDROW_MSA SAROW(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, 1, 2, 15) #endif #ifdef HAS_SCALEADDROW_MMI SAROW(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, 1, 2, 7) #endif #undef SAANY #else // Add rows box filter scale down. 
#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ int n = src_width & ~MASK; \ if (n > 0) { \ SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ } \ SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ } #ifdef HAS_SCALEADDROW_SSE2 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) #endif #ifdef HAS_SCALEADDROW_AVX2 SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) #endif #ifdef HAS_SCALEADDROW_NEON SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #endif #ifdef HAS_SCALEADDROW_MSA SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) #endif #ifdef HAS_SCALEADDROW_MMI SAANY(ScaleAddRow_Any_MMI, ScaleAddRow_MMI, ScaleAddRow_C, 7) #endif #undef SAANY #endif // SASIMDONLY // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols #define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ int dx) { \ int r = dst_width & MASK; \ int n = dst_width & ~MASK; \ if (n > 0) { \ TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ } \ TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ } #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif #ifdef HAS_SCALEFILTERCOLS_MSA CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) #endif #ifdef HAS_SCALEARGBCOLS_NEON CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #endif #ifdef HAS_SCALEARGBCOLS_MSA CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) #endif #ifdef HAS_SCALEARGBCOLS_MMI CANY(ScaleARGBCols_Any_MMI, ScaleARGBCols_MMI, ScaleARGBCols_C, 4, 0) #endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, ScaleARGBFilterCols_C, 4, 3) #endif #ifdef HAS_SCALEARGBFILTERCOLS_MSA CANY(ScaleARGBFilterCols_Any_MSA, ScaleARGBFilterCols_MSA, 
ScaleARGBFilterCols_C, 4, 7) #endif #undef CANY // Scale up horizontally 2 times using linear filter. #define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ int work_width = (dst_width - 1) & ~1; \ int r = work_width & MASK; \ int n = work_width & ~MASK; \ dst_ptr[0] = src_ptr[0]; \ if (work_width > 0) { \ if (n != 0) { \ SIMD(src_ptr, dst_ptr + 1, n); \ } \ C(src_ptr + (n / 2), dst_ptr + n + 1, r); \ } \ dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \ } // Even the C versions need to be wrapped, because boundary pixels have to // be handled differently SUH2LANY(ScaleRowUp2_Linear_Any_C, ScaleRowUp2_Linear_C, ScaleRowUp2_Linear_C, 0, uint8_t) SUH2LANY(ScaleRowUp2_Linear_16_Any_C, ScaleRowUp2_Linear_16_C, ScaleRowUp2_Linear_16_C, 0, uint16_t) #ifdef HAS_SCALEROWUP2LINEAR_SSE2 SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, ScaleRowUp2_Linear_SSE2, ScaleRowUp2_Linear_C, 15, uint8_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_SSSE3 SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, ScaleRowUp2_Linear_SSSE3, ScaleRowUp2_Linear_C, 15, uint8_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 SUH2LANY(ScaleRowUp2_Linear_12_Any_SSSE3, ScaleRowUp2_Linear_12_SSSE3, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, ScaleRowUp2_Linear_16_SSE2, ScaleRowUp2_Linear_16_C, 7, uint16_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_AVX2 SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, ScaleRowUp2_Linear_AVX2, ScaleRowUp2_Linear_C, 31, uint8_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 SUH2LANY(ScaleRowUp2_Linear_12_Any_AVX2, ScaleRowUp2_Linear_12_AVX2, ScaleRowUp2_Linear_16_C, 31, uint16_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, ScaleRowUp2_Linear_16_AVX2, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_NEON SUH2LANY(ScaleRowUp2_Linear_Any_NEON, ScaleRowUp2_Linear_NEON, ScaleRowUp2_Linear_C, 15, uint8_t) #endif #ifdef 
HAS_SCALEROWUP2LINEAR_12_NEON SUH2LANY(ScaleRowUp2_Linear_12_Any_NEON, ScaleRowUp2_Linear_12_NEON, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2LINEAR_16_NEON SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, ScaleRowUp2_Linear_16_NEON, ScaleRowUp2_Linear_16_C, 15, uint16_t) #endif #undef SUH2LANY // Scale up 2 times using bilinear filter. // This function produces 2 rows at a time. #define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ ptrdiff_t dst_stride, int dst_width) { \ int work_width = (dst_width - 1) & ~1; \ int r = work_width & MASK; \ int n = work_width & ~MASK; \ const PTYPE* sa = src_ptr; \ const PTYPE* sb = src_ptr + src_stride; \ PTYPE* da = dst_ptr; \ PTYPE* db = dst_ptr + dst_stride; \ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ if (work_width > 0) { \ if (n != 0) { \ SIMD(sa, sb - sa, da + 1, db - da, n); \ } \ C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \ } \ da[dst_width - 1] = \ (3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2] + 2) >> 2; \ db[dst_width - 1] = \ (sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2] + 2) >> 2; \ } SU2BLANY(ScaleRowUp2_Bilinear_Any_C, ScaleRowUp2_Bilinear_C, ScaleRowUp2_Bilinear_C, 0, uint8_t) SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, ScaleRowUp2_Bilinear_16_C, ScaleRowUp2_Bilinear_16_C, 0, uint16_t) #ifdef HAS_SCALEROWUP2BILINEAR_SSE2 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, ScaleRowUp2_Bilinear_SSE2, ScaleRowUp2_Bilinear_C, 15, uint8_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 SU2BLANY(ScaleRowUp2_Bilinear_12_Any_SSSE3, ScaleRowUp2_Bilinear_12_SSSE3, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3, ScaleRowUp2_Bilinear_16_SSE2, ScaleRowUp2_Bilinear_16_C, 7, uint16_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, ScaleRowUp2_Bilinear_C, 
15, uint8_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, ScaleRowUp2_Bilinear_AVX2, ScaleRowUp2_Bilinear_C, 31, uint8_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_12_Any_AVX2, ScaleRowUp2_Bilinear_12_AVX2, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, ScaleRowUp2_Bilinear_16_AVX2, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_NEON SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, ScaleRowUp2_Bilinear_NEON, ScaleRowUp2_Bilinear_C, 15, uint8_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_12_NEON SU2BLANY(ScaleRowUp2_Bilinear_12_Any_NEON, ScaleRowUp2_Bilinear_12_NEON, ScaleRowUp2_Bilinear_16_C, 15, uint16_t) #endif #ifdef HAS_SCALEROWUP2BILINEAR_16_NEON SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, ScaleRowUp2_Bilinear_16_NEON, ScaleRowUp2_Bilinear_16_C, 7, uint16_t) #endif #undef SU2BLANY // Scale bi-planar plane up horizontally 2 times using linear filter. 
#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ int work_width = (dst_width - 1) & ~1; \ int r = work_width & MASK; \ int n = work_width & ~MASK; \ dst_ptr[0] = src_ptr[0]; \ dst_ptr[1] = src_ptr[1]; \ if (work_width > 0) { \ if (n != 0) { \ SIMD(src_ptr, dst_ptr + 2, n); \ } \ C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ } \ dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ } SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, ScaleUVRowUp2_Linear_C, ScaleUVRowUp2_Linear_C, 0, uint8_t) SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C, ScaleUVRowUp2_Linear_16_C, ScaleUVRowUp2_Linear_16_C, 0, uint16_t) #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, ScaleUVRowUp2_Linear_SSSE3, ScaleUVRowUp2_Linear_C, 7, uint8_t) #endif #ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, ScaleUVRowUp2_Linear_AVX2, ScaleUVRowUp2_Linear_C, 15, uint8_t) #endif #ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2, ScaleUVRowUp2_Linear_16_SSE2, ScaleUVRowUp2_Linear_16_C, 3, uint16_t) #endif #ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2, ScaleUVRowUp2_Linear_16_AVX2, ScaleUVRowUp2_Linear_16_C, 7, uint16_t) #endif #ifdef HAS_SCALEUVROWUP2LINEAR_NEON SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, ScaleUVRowUp2_Linear_NEON, ScaleUVRowUp2_Linear_C, 15, uint8_t) #endif #ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON, ScaleUVRowUp2_Linear_16_NEON, ScaleUVRowUp2_Linear_16_C, 15, uint16_t) #endif #undef SBUH2LANY // Scale bi-planar plane up 2 times using bilinear filter. // This function produces 2 rows at a time. 
#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ ptrdiff_t dst_stride, int dst_width) { \ int work_width = (dst_width - 1) & ~1; \ int r = work_width & MASK; \ int n = work_width & ~MASK; \ const PTYPE* sa = src_ptr; \ const PTYPE* sb = src_ptr + src_stride; \ PTYPE* da = dst_ptr; \ PTYPE* db = dst_ptr + dst_stride; \ da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \ db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \ da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \ db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \ if (work_width > 0) { \ if (n != 0) { \ SIMD(sa, sb - sa, da + 2, db - da, n); \ } \ C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ } \ da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ sb[((dst_width + 1) & ~1) - 2] + 2) >> \ 2; \ db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ 3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> \ 2; \ da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ sb[((dst_width + 1) & ~1) - 1] + 2) >> \ 2; \ db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ 3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> \ 2; \ } SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, ScaleUVRowUp2_Bilinear_C, ScaleUVRowUp2_Bilinear_C, 0, uint8_t) SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C, ScaleUVRowUp2_Bilinear_16_C, ScaleUVRowUp2_Bilinear_16_C, 0, uint16_t) #ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, ScaleUVRowUp2_Bilinear_SSSE3, ScaleUVRowUp2_Bilinear_C, 7, uint8_t) #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, ScaleUVRowUp2_Bilinear_AVX2, ScaleUVRowUp2_Bilinear_C, 15, uint8_t) #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2, ScaleUVRowUp2_Bilinear_16_SSE2, ScaleUVRowUp2_Bilinear_16_C, 7, uint16_t) #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2, ScaleUVRowUp2_Bilinear_16_AVX2, ScaleUVRowUp2_Bilinear_16_C, 7, uint16_t) #endif 
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, ScaleUVRowUp2_Bilinear_NEON, ScaleUVRowUp2_Bilinear_C, 7, uint8_t) #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON, ScaleUVRowUp2_Bilinear_16_NEON, ScaleUVRowUp2_Bilinear_16_C, 7, uint16_t) #endif #undef SBU2BLANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_argb.cc000066400000000000000000001077151416500237200214320ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/scale.h" #include #include #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif static __inline int Abs(int v) { return v >= 0 ? v : -v; } // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. static void ScaleARGBDown2(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) = filtering == kFilterNone ? ScaleARGBRowDown2_C : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : ScaleARGBRowDown2Box_C); (void)src_width; (void)src_height; (void)dx; assert(dx == 65536 * 2); // Test scale factor of 2. 
assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; } else { src_argb += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 4; } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : ScaleARGBRowDown2Box_Any_SSE2); if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : ScaleARGBRowDown2Box_SSE2); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : ScaleARGBRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : ScaleARGBRowDown2Box_NEON); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_MMI : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MMI : ScaleARGBRowDown2Box_Any_MMI); if (IS_ALIGNED(dst_width, 2)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_MMI : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MMI : ScaleARGBRowDown2Box_MMI); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_MSA : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA : ScaleARGBRowDown2Box_Any_MSA); if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_MSA : (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_MSA : ScaleARGBRowDown2Box_MSA); } } #endif if (filtering == kFilterLinear) { src_stride = 0; } for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; } } // ScaleARGB ARGB, 1/4 // This is an optimized version for scaling down a ARGB to 1/4 of // its original size. static void ScaleARGBDown4Box(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int dx, int y, int dy) { int j; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; // Advance to odd row, even column. src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; (void)src_width; (void)src_height; (void)dx; assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; } } #endif for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, dst_width * 2); ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; } free_aligned_buffer_64(row); } // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. static void ScaleARGBDownEven(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; int col_step = dx >> 16; int row_stride = (dy >> 16) * (int64_t)src_stride; void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; (void)src_width; (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * (int64_t)src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleARGBRowDownEven = filtering ? 
ScaleARGBRowDownEvenBox_Any_NEON : ScaleARGBRowDownEven_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MMI : ScaleARGBRowDownEven_Any_MMI; if (IS_ALIGNED(dst_width, 2)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_MMI : ScaleARGBRowDownEven_MMI; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA : ScaleARGBRowDownEven_Any_MSA; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; } } #endif if (filtering == kFilterLinear) { src_stride = 0; } for (j = 0; j < dst_height; ++j) { ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; } } // Scale ARGB down with bilinear interpolation. static void ScaleARGBBilinearDown(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; int64_t xl = (dx >= 0) ? x : xlast; int64_t xr = (dx >= 0) ? xlast : x; int clip_src_width; xl = (xl >> 16) & ~3; // Left edge aligned. xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. 
if (xr > src_width) { xr = src_width; } clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4. src_argb += xl * 4; x -= (int)(xl << 16); #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(clip_src_width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(clip_src_width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(clip_src_width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; } } #endif #if defined(HAS_SCALEARGBFILTERCOLS_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; } } #endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. 
{ align_buffer_64(row, clip_src_width * 4); const int max_y = (src_height - 1) << 16; if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; const uint8_t* src = src_argb + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; InterpolateRow(row, src, src_stride, clip_src_width, yf); ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx); } dst_argb += dst_stride; y += dy; if (y > max_y) { y = max_y; } } free_aligned_buffer_64(row); } } // Scale ARGB up with bilinear interpolation. static void ScaleARGBBilinearUp(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? 
ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(dst_width, 2)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_MSA; } } #endif if (src_width >= 32768) { ScaleARGBFilterCols = filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) if (filtering && TestCpuFlag(kCpuHasNEON)) { ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; } } #endif #if defined(HAS_SCALEARGBFILTERCOLS_MSA) if (filtering && TestCpuFlag(kCpuHasMSA)) { ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; } } #endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) if (!filtering && TestCpuFlag(kCpuHasNEON)) { ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBCols_NEON; } } #endif #if defined(HAS_SCALEARGBCOLS_MMI) if (!filtering && TestCpuFlag(kCpuHasMMI)) { ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; if (IS_ALIGNED(dst_width, 1)) { ScaleARGBFilterCols = ScaleARGBCols_MMI; } } #endif #if defined(HAS_SCALEARGBCOLS_MSA) if (!filtering && TestCpuFlag(kCpuHasMSA)) { ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBCols_MSA; } } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif #if defined(HAS_SCALEARGBCOLSUP2_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; } #endif } if (y > max_y) { y = max_y; } { int yi = y >> 16; const uint8_t* src = src_argb + yi * (int64_t)src_stride; // Allocate 2 rows of 
ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); if (src_height > 1) { src += src_stride; } ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx); src += src_stride; for (j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { if (y > max_y) { y = max_y; yi = y >> 16; src = src_argb + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleARGBFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; src += src_stride; } } if (filtering == kFilterLinear) { InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); } else { int yf = (y >> 8) & 255; InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); } dst_argb += dst_stride; y += dy; } free_aligned_buffer_64(row); } } #ifdef YUVSCALEUP // Scale YUV to ARGB up with bilinear interpolation. static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, int dst_width, int dst_height, int src_stride_y, int src_stride_u, int src_stride_v, int dst_stride_argb, const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_argb, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, uint8_t* rgb_buf, int width) = I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(src_width, 8)) { I422ToARGBRow = I422ToARGBRow_SSSE3; } } #endif #if defined(HAS_I422TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { I422ToARGBRow = I422ToARGBRow_Any_AVX2; if (IS_ALIGNED(src_width, 16)) { I422ToARGBRow = I422ToARGBRow_AVX2; } } #endif #if defined(HAS_I422TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToARGBRow = I422ToARGBRow_Any_NEON; if (IS_ALIGNED(src_width, 8)) { I422ToARGBRow = I422ToARGBRow_NEON; } } #endif #if 
defined(HAS_I422TOARGBROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { I422ToARGBRow = I422ToARGBRow_Any_MMI; if (IS_ALIGNED(src_width, 4)) { I422ToARGBRow = I422ToARGBRow_MMI; } } #endif #if defined(HAS_I422TOARGBROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { I422ToARGBRow = I422ToARGBRow_Any_MSA; if (IS_ALIGNED(src_width, 8)) { I422ToARGBRow = I422ToARGBRow_MSA; } } #endif void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_MSA; } } #endif void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { ScaleARGBFilterCols = filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERCOLS_NEON) if (filtering && TestCpuFlag(kCpuHasNEON)) { ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; } } #endif #if defined(HAS_SCALEARGBFILTERCOLS_MSA) if (filtering && TestCpuFlag(kCpuHasMSA)) { ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; } } #endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) if (!filtering && TestCpuFlag(kCpuHasNEON)) { ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBCols_NEON; } } #endif #if defined(HAS_SCALEARGBCOLS_MMI) if (!filtering && TestCpuFlag(kCpuHasMMI)) { ScaleARGBFilterCols = ScaleARGBCols_Any_MMI; if (IS_ALIGNED(dst_width, 1)) { ScaleARGBFilterCols = ScaleARGBCols_MMI; } } #endif #if defined(HAS_SCALEARGBCOLS_MSA) if (!filtering && TestCpuFlag(kCpuHasMSA)) { ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBCols_MSA; } } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2; } #endif #if defined(HAS_SCALEARGBCOLSUP2_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { ScaleARGBFilterCols = ScaleARGBColsUp2_MMI; } #endif } const int max_y = (src_height - 1) << 16; if (y > max_y) { y = max_y; } const int kYShift = 1; // Shift Y by 1 to convert Y plane to 
UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; const uint8_t* src_row_y = src_y + yi * (int64_t)src_stride_y; const uint8_t* src_row_u = src_u + uv_yi * (int64_t)src_stride_u; const uint8_t* src_row_v = src_v + uv_yi * (int64_t)src_stride_v; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; // TODO(fbarchard): Convert first 2 rows of YUV to ARGB. ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx); if (src_height > 1) { src_row_y += src_stride_y; if (yi & 1) { src_row_u += src_stride_u; src_row_v += src_stride_v; } } ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx); if (src_height > 2) { src_row_y += src_stride_y; if (!(yi & 1)) { src_row_u += src_stride_u; src_row_v += src_stride_v; } } for (j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { if (y > max_y) { y = max_y; yi = y >> 16; uv_yi = yi >> kYShift; src_row_y = src_y + yi * (int64_t)src_stride_y; src_row_u = src_u + uv_yi * (int64_t)src_stride_u; src_row_v = src_v + uv_yi * (int64_t)src_stride_v; } if (yi != lasty) { // TODO(fbarchard): Convert the clipped region of row. I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width); ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; src_row_y += src_stride_y; if (yi & 1) { src_row_u += src_stride_u; src_row_v += src_stride_v; } } } if (filtering == kFilterLinear) { InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0); } else { int yf = (y >> 8) & 255; InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf); } dst_argb += dst_stride_argb; y += dy; } free_aligned_buffer_64(row); free_aligned_buffer_64(row_argb); } #endif // Scale ARGB to/from any dimensions, without interpolation. 
// Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. static void ScaleARGBSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int dx, int y, int dy) { int j; void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C; (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBCols = ScaleARGBCols_SSE2; } #endif #if defined(HAS_SCALEARGBCOLS_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleARGBCols = ScaleARGBCols_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleARGBCols = ScaleARGBCols_NEON; } } #endif #if defined(HAS_SCALEARGBCOLS_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleARGBCols = ScaleARGBCols_Any_MMI; if (IS_ALIGNED(dst_width, 1)) { ScaleARGBCols = ScaleARGBCols_MMI; } } #endif #if defined(HAS_SCALEARGBCOLS_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleARGBCols = ScaleARGBCols_Any_MSA; if (IS_ALIGNED(dst_width, 4)) { ScaleARGBCols = ScaleARGBCols_MSA; } } #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { ScaleARGBCols = ScaleARGBColsUp2_SSE2; } #endif #if defined(HAS_SCALEARGBCOLSUP2_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { ScaleARGBCols = ScaleARGBColsUp2_MMI; } #endif } for (j = 0; j < dst_height; ++j) { ScaleARGBCols(dst_argb, src_argb + (y >> 16) * (int64_t)src_stride, dst_width, x, dx); dst_argb += dst_stride; y += dy; } } // ScaleARGB a ARGB. // This function in turn calls a scaling function // suitable for handling the desired resolutions. 
static void ScaleARGB(const uint8_t* src, int src_stride, int src_width, int src_height, uint8_t* dst, int dst_stride, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; // ARGB does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); src_width = Abs(src_width); if (clip_x) { int64_t clipf = (int64_t)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 4; dst += clip_x * 4; } if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * (int64_t)src_stride; dst += clip_y * dst_stride; } // Special case for integer step values. if (((dx | dy) & 0xffff) == 0) { if (!dx || !dy) { // 1 pixel wide and/or tall. filtering = kFilterNone; } else { // Optimized even scale down. ie 2, 4, 6, 8, 10x. if (!(dx & 0x10000) && !(dy & 0x10000)) { if (dx == 0x20000) { // Optimized 1/2 downsample. ScaleARGBDown2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy); return; } ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } // Optimized odd scale down. ie 3, 5, 7, 9x. 
if ((dx & 0x10000) && (dy & 0x10000)) { filtering = kFilterNone; if (dx == 0x10000 && dy == 0x10000) { // Straight copy. ARGBCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 4, src_stride, dst, dst_stride, clip_width, clip_height); return; } } } } if (dx == 0x10000 && (x & 0xffff) == 0) { // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, y, dy, 4, filtering); return; } if (filtering && dy < 65536) { ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } if (filtering) { ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy); } LIBYUV_API int ARGBScaleClip(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering) { if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || clip_width > 32768 || clip_height > 32768 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; } ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, clip_height, filtering); return 0; } // Scale an ARGB image. 
LIBYUV_API int ARGBScale(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, enum FilterMode filtering) { if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); return 0; } // Scale with YUV conversion to ARGB and clipping. LIBYUV_API int YUVToARGBScaleClip(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint32_t src_fourcc, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, uint32_t dst_fourcc, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering) { uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); int r; (void)src_fourcc; // TODO(fbarchard): implement and/or assert. (void)dst_fourcc; I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, argb_buffer, src_width * 4, src_width, src_height); r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, clip_height, filtering); free(argb_buffer); return r; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_common.cc000066400000000000000000001522301416500237200217770ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. 
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/scale.h" #include #include #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif static __inline int Abs(int v) { return v >= 0 ? v : -v; } // CPU agnostic row functions void ScaleRowDown2_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; dst += 2; src_ptr += 4; } if (dst_width & 1) { dst[0] = src_ptr[1]; } } void ScaleRowDown2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; dst += 2; src_ptr += 4; } if (dst_width & 1) { dst[0] = src_ptr[1]; } } void ScaleRowDown2Linear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { const uint8_t* s = src_ptr; int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; dst += 2; s += 4; } if (dst_width & 1) { dst[0] = (s[0] + s[1] + 1) >> 1; } } void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { const uint16_t* s = src_ptr; int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; dst += 2; s += 4; } if (dst_width & 1) { dst[0] = (s[0] + s[1] + 1) >> 1; } } void ScaleRowDown2Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; dst += 
2; s += 4; t += 4; } if (dst_width & 1) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; } } void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; dst_width -= 1; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; dst += 2; s += 4; t += 4; } if (dst_width & 1) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; dst += 1; s += 2; t += 2; } dst[0] = (s[0] + t[0] + 1) >> 1; } void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { const uint16_t* s = src_ptr; const uint16_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2; dst += 2; s += 4; t += 4; } if (dst_width & 1) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; } } void ScaleRowDown4_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = src_ptr[2]; } } void ScaleRowDown4_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { int x; (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = src_ptr[2]; } } void ScaleRowDown4Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + 
src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + 8) >> 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + 8) >> 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + 8) >> 4; } } void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + 8) >> 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + 8) >> 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + 
src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + 8) >> 4; } } void ScaleRowDown34_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; dst[1] = src_ptr[1]; dst[2] = src_ptr[3]; dst += 3; src_ptr += 4; } } void ScaleRowDown34_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { int x; (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; dst[1] = src_ptr[1]; dst[2] = src_ptr[3]; dst += 3; src_ptr += 4; } } // Filter rows 0 and 1 together, 3 : 1 void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; d += 3; s += 4; t += 4; } } void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* d, int dst_width) { const uint16_t* s = src_ptr; const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 
2; uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; d += 3; s += 4; t += 4; } } // Filter rows 1 and 2 together, 1 : 1 void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; d += 3; s += 4; t += 4; } } void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* d, int dst_width) { const uint16_t* s = src_ptr; const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; d += 3; s += 4; t += 4; } } // Sample position: (O is src sample position, X is dst sample position) // // v dst_ptr at here v stop at here // X O X X O X X O X X O X X O X // ^ src_ptr at here void ScaleRowUp2_Linear_C(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { int src_width = dst_width >> 1; int x; assert((dst_width % 2 == 0) && (dst_width >= 0)); for (x = 0; x < src_width; ++x) { dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 
2; dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; } } // Sample position: (O is src sample position, X is dst sample position) // // src_ptr at here // X v X X X X X X X X X // O O O O O // X X X X X X X X X X // ^ dst_ptr at here ^ stop at here // X X X X X X X X X X // O O O O O // X X X X X X X X X X void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; uint8_t* d = dst_ptr; uint8_t* e = dst_ptr + dst_stride; int src_width = dst_width >> 1; int x; assert((dst_width % 2 == 0) && (dst_width >= 0)); for (x = 0; x < src_width; ++x) { d[2 * x + 0] = (s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4; d[2 * x + 1] = (s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4; e[2 * x + 0] = (s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4; e[2 * x + 1] = (s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4; } } // Only suitable for at most 14 bit range. void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { int src_width = dst_width >> 1; int x; assert((dst_width % 2 == 0) && (dst_width >= 0)); for (x = 0; x < src_width; ++x) { dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2; dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2; } } // Only suitable for at most 12bit range. 
// 2x bilinear upsample of 16 bit samples.  Reads rows src_ptr and
// src_ptr + src_stride, writes rows dst_ptr and dst_ptr + dst_stride.
// Every source position x produces a 2x2 group of outputs weighted
// 9:3:3:1 towards the nearest source sample.  dst_width must be even.
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint16_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width) {
  const uint16_t* top = src_ptr;
  const uint16_t* bottom = src_ptr + src_stride;
  uint16_t* out0 = dst_ptr;
  uint16_t* out1 = dst_ptr + dst_stride;
  const int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    const int tl = top[x];
    const int tr = top[x + 1];
    const int bl = bottom[x];
    const int br = bottom[x + 1];
    out0[2 * x + 0] = (tl * 9 + tr * 3 + bl * 3 + br + 8) >> 4;
    out0[2 * x + 1] = (tl * 3 + tr * 9 + bl + br * 3 + 8) >> 4;
    out1[2 * x + 0] = (tl * 3 + tr + bl * 9 + br * 3 + 8) >> 4;
    out1[2 * x + 1] = (tl + tr * 3 + bl * 3 + br * 9 + 8) >> 4;
  }
}

// Scales a single row of pixels using point sampling.
// x is the 16.16 fixed point source position of the first output and dx the
// 16.16 step between outputs; output i is src_ptr[(x + i * dx) >> 16].
void ScaleCols_C(uint8_t* dst_ptr,
                 const uint8_t* src_ptr,
                 int dst_width,
                 int x,
                 int dx) {
  int i;
  // Two outputs per iteration; i counts outputs already written.
  for (i = 0; i < dst_width - 1; i += 2) {
    dst_ptr[i] = src_ptr[x >> 16];
    x += dx;
    dst_ptr[i + 1] = src_ptr[x >> 16];
    x += dx;
  }
  if (dst_width & 1) {
    dst_ptr[i] = src_ptr[x >> 16];
  }
}

// 16 bit variant of ScaleCols_C.
void ScaleCols_16_C(uint16_t* dst_ptr,
                    const uint16_t* src_ptr,
                    int dst_width,
                    int x,
                    int dx) {
  int i;
  for (i = 0; i < dst_width - 1; i += 2) {
    dst_ptr[i] = src_ptr[x >> 16];
    x += dx;
    dst_ptr[i + 1] = src_ptr[x >> 16];
    x += dx;
  }
  if (dst_width & 1) {
    dst_ptr[i] = src_ptr[x >> 16];
  }
}

// Scales a single row of pixels up by 2x using point sampling.
// Output i is src_ptr[i / 2]; x and dx are ignored.
void ScaleColsUp2_C(uint8_t* dst_ptr,
                    const uint8_t* src_ptr,
                    int dst_width,
                    int x,
                    int dx) {
  int i;
  (void)x;
  (void)dx;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i >> 1];
  }
}

void ScaleColsUp2_16_C(uint16_t* dst_ptr,
                       const uint16_t* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  int i;
  (void)x;
  (void)dx;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = src_ptr[i >> 1];
  }
}

// (1-f)a + fb can be replaced with a + f(b-a)
#if defined(__arm__) || defined(__aarch64__)
#define BLENDER(a, b, f) \
  (uint8_t)((int)(a) +   \
            ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
#else
// Intel uses 7 bit math with rounding.
#define BLENDER(a, b, f) \
  (uint8_t)((int)(a) +   \
            (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
#endif

// Blends src_ptr[x >> 16] and its right neighbour using the low 16 bits of
// the 16.16 fixed point position x as the fraction.
static __inline uint8_t ScaleFilterSample_C(const uint8_t* src_ptr,
                                            int64_t x) {
  const int64_t xi = x >> 16;
  const int a = src_ptr[xi];
  const int b = src_ptr[xi + 1];
  return BLENDER(a, b, x & 0xffff);
}

// Horizontal linear filter; x and dx are 16.16 fixed point.  Reads one
// sample past the position of each output.
void ScaleFilterCols_C(uint8_t* dst_ptr,
                       const uint8_t* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = ScaleFilterSample_C(src_ptr, x);
    // Skip the step after the last output so x is not advanced needlessly.
    if (i + 1 < dst_width) {
      x += dx;
    }
  }
}

// Same as ScaleFilterCols_C but tracks the position in 64 bits.
void ScaleFilterCols64_C(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         int dst_width,
                         int x32,
                         int dx) {
  int64_t x = (int64_t)(x32);
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = ScaleFilterSample_C(src_ptr, x);
    x += dx;
  }
}
#undef BLENDER

// Same as 8 bit arm blender but return is cast to uint16_t
#define BLENDER(a, b, f)                                                    \
  (uint16_t)(                                                               \
      (int)(a) +                                                            \
      (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16))

// 16 bit variant of ScaleFilterSample_C.
static __inline uint16_t ScaleFilterSample_16_C(const uint16_t* src_ptr,
                                                int64_t x) {
  const int64_t xi = x >> 16;
  const int a = src_ptr[xi];
  const int b = src_ptr[xi + 1];
  return BLENDER(a, b, x & 0xffff);
}

void ScaleFilterCols_16_C(uint16_t* dst_ptr,
                          const uint16_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = ScaleFilterSample_16_C(src_ptr, x);
    if (i + 1 < dst_width) {
      x += dx;
    }
  }
}

void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
                            const uint16_t* src_ptr,
                            int dst_width,
                            int x32,
                            int dx) {
  int64_t x = (int64_t)(x32);
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] = ScaleFilterSample_16_C(src_ptr, x);
    x += dx;
  }
}
#undef BLENDER

// 3/8 point sample: every 8 source samples produce 3 outputs (samples 0, 3
// and 6).
void ScaleRowDown38_C(const uint8_t* src_ptr,
                      ptrdiff_t src_stride,
                      uint8_t* dst,
                      int dst_width) {
  int i;
  (void)src_stride;
  assert(dst_width % 3 == 0);
  for (i = 0; i < dst_width; i += 3) {
    dst[i + 0] = src_ptr[0];
    dst[i + 1] = src_ptr[3];
    dst[i + 2] = src_ptr[6];
    src_ptr += 8;
  }
}

void ScaleRowDown38_16_C(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  int i;
  (void)src_stride;
  assert(dst_width % 3 == 0);
  for (i = 0; i < dst_width; i += 3) {
    dst[i + 0] = src_ptr[0];
    dst[i + 1] = src_ptr[3];
    dst[i + 2] = src_ptr[6];
    src_ptr += 8;
  }
}

// Sum of a cols x rows window of 8 bit samples with top-left sample p[0].
static __inline uint32_t ScaleBoxSum38_C(const uint8_t* p,
                                         ptrdiff_t stride,
                                         int cols,
                                         int rows) {
  uint32_t sum = 0;
  int r;
  int c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c) {
      sum += p[r * stride + c];
    }
  }
  return sum;
}

// Sum of a cols x rows window of 16 bit samples with top-left sample p[0].
// The sum is unsigned so that the later multiply by 65536 / N (N = number of
// samples, at most 9) cannot overflow: 9 * 65535 * 7281 < 2^32, whereas the
// same product overflows a signed 32 bit int.
static __inline uint32_t ScaleBoxSum38_16_C(const uint16_t* p,
                                            ptrdiff_t stride,
                                            int cols,
                                            int rows) {
  uint32_t sum = 0;
  int r;
  int c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c) {
      sum += p[r * stride + c];
    }
  }
  return sum;
}

// 8x3 -> 3x1
// Outputs 0 and 1 average 3x3 windows, output 2 a 2x3 window.  The divide is
// done as a multiply by 65536 / N followed by >> 16.
void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  int i;
  assert((dst_width % 3 == 0) && (dst_width > 0));
  for (i = 0; i < dst_width; i += 3) {
    dst_ptr[0] = (uint8_t)(
        (ScaleBoxSum38_C(src_ptr + 0, src_stride, 3, 3) * (65536u / 9u)) >> 16);
    dst_ptr[1] = (uint8_t)(
        (ScaleBoxSum38_C(src_ptr + 3, src_stride, 3, 3) * (65536u / 9u)) >> 16);
    dst_ptr[2] = (uint8_t)(
        (ScaleBoxSum38_C(src_ptr + 6, src_stride, 2, 3) * (65536u / 6u)) >> 16);
    src_ptr += 8;
    dst_ptr += 3;
  }
}

void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint16_t* dst_ptr,
                               int dst_width) {
  int i;
  assert((dst_width % 3 == 0) && (dst_width > 0));
  for (i = 0; i < dst_width; i += 3) {
    dst_ptr[0] = (uint16_t)(
        (ScaleBoxSum38_16_C(src_ptr + 0, src_stride, 3, 3) * (65536u / 9u)) >>
        16);
    dst_ptr[1] = (uint16_t)(
        (ScaleBoxSum38_16_C(src_ptr + 3, src_stride, 3, 3) * (65536u / 9u)) >>
        16);
    dst_ptr[2] = (uint16_t)(
        (ScaleBoxSum38_16_C(src_ptr + 6, src_stride, 2, 3) * (65536u / 6u)) >>
        16);
    src_ptr += 8;
    dst_ptr += 3;
  }
}

// 8x2 -> 3x1
// Outputs 0 and 1 average 3x2 windows, output 2 a 2x2 window.
void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst_ptr,
                            int dst_width) {
  int i;
  assert((dst_width % 3 == 0) && (dst_width > 0));
  for (i = 0; i < dst_width; i += 3) {
    dst_ptr[0] = (uint8_t)(
        (ScaleBoxSum38_C(src_ptr + 0, src_stride, 3, 2) * (65536u / 6u)) >> 16);
    dst_ptr[1] = (uint8_t)(
        (ScaleBoxSum38_C(src_ptr + 3, src_stride, 3, 2) * (65536u / 6u)) >> 16);
    dst_ptr[2] = (uint8_t)(
        (ScaleBoxSum38_C(src_ptr + 6, src_stride, 2, 2) * (65536u / 4u)) >> 16);
    src_ptr += 8;
    dst_ptr += 3;
  }
}

void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint16_t* dst_ptr,
                               int dst_width) {
  int i;
  assert((dst_width % 3 == 0) && (dst_width > 0));
  for (i = 0; i < dst_width; i += 3) {
    dst_ptr[0] = (uint16_t)(
        (ScaleBoxSum38_16_C(src_ptr + 0, src_stride, 3, 2) * (65536u / 6u)) >>
        16);
    dst_ptr[1] = (uint16_t)(
        (ScaleBoxSum38_16_C(src_ptr + 3, src_stride, 3, 2) * (65536u / 6u)) >>
        16);
    dst_ptr[2] = (uint16_t)(
        (ScaleBoxSum38_16_C(src_ptr + 6, src_stride, 2, 2) * (65536u / 4u)) >>
        16);
    src_ptr += 8;
    dst_ptr += 3;
  }
}

// Adds each 8 bit source sample into the matching 16 bit accumulator.
void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  int i;
  assert(src_width > 0);
  for (i = 0; i < src_width; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

// Adds each 16 bit source sample into the matching 32 bit accumulator.
void ScaleAddRow_16_C(const uint16_t* src_ptr,
                      uint32_t* dst_ptr,
                      int src_width) {
  int i;
  assert(src_width > 0);
  for (i = 0; i < src_width; ++i) {
    dst_ptr[i] += src_ptr[i];
  }
}

// ARGB scale row functions

// 1/2 point sample of 4 byte pixels: output i is source pixel 2 * i + 1.
// Pixels are moved as whole 32 bit words.
void ScaleARGBRowDown2_C(const uint8_t* src_argb,
                         ptrdiff_t src_stride,
                         uint8_t* dst_argb,
                         int dst_width) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int i;
  (void)src_stride;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[1];
    src += 2;
  }
}

// 1/2 linear: each output channel is the rounded average of the same channel
// in two horizontally adjacent source pixels.
void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               uint8_t* dst_argb,
                               int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (src_argb[c] + src_argb[c + 4] + 1) >> 1;
    }
    src_argb += 8;
    dst_argb += 4;
  }
}

// 1/2 box: each output channel is the rounded average of a 2x2 pixel block
// taken from this row and the row src_stride bytes below.
void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
                            ptrdiff_t src_stride,
                            uint8_t* dst_argb,
                            int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (src_argb[c] + src_argb[c + 4] + src_argb[src_stride + c] +
                     src_argb[src_stride + c + 4] + 2) >>
                    2;
    }
    src_argb += 8;
    dst_argb += 4;
  }
}

// Point sample every src_stepx-th pixel, starting with pixel 0.
void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
                            ptrdiff_t src_stride,
                            int src_stepx,
                            uint8_t* dst_argb,
                            int dst_width) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int i;
  (void)src_stride;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[0];
    src += src_stepx;
  }
}

// 2x2 box filter whose windows start src_stepx pixels apart.
void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      dst_argb[c] = (src_argb[c] + src_argb[c + 4] + src_argb[src_stride + c] +
                     src_argb[src_stride + c + 4] + 2) >>
                    2;
    }
    src_argb += src_stepx * 4;
    dst_argb += 4;
  }
}

// Scales a single row of pixels using point sampling.
// Point samples a row of 4 byte pixels, moving each pixel as one 32 bit
// word.  x is the 16.16 fixed point source position of the first output and
// dx the 16.16 step; output i is source pixel (x + i * dx) >> 16.
void ScaleARGBCols_C(uint8_t* dst_argb,
                     const uint8_t* src_argb,
                     int dst_width,
                     int x,
                     int dx) {
  const uint32_t* src_pixels = (const uint32_t*)(src_argb);
  uint32_t* dst_pixels = (uint32_t*)(dst_argb);
  int i;
  // Two outputs per iteration; i counts outputs already written.
  for (i = 0; i < dst_width - 1; i += 2) {
    dst_pixels[i] = src_pixels[x >> 16];
    x += dx;
    dst_pixels[i + 1] = src_pixels[x >> 16];
    x += dx;
  }
  if (dst_width & 1) {
    dst_pixels[i] = src_pixels[x >> 16];
  }
}

// Same as ScaleARGBCols_C but the position is tracked in 64 bits.
void ScaleARGBCols64_C(uint8_t* dst_argb,
                       const uint8_t* src_argb,
                       int dst_width,
                       int x32,
                       int dx) {
  const uint32_t* src_pixels = (const uint32_t*)(src_argb);
  uint32_t* dst_pixels = (uint32_t*)(dst_argb);
  int64_t pos = (int64_t)(x32);
  int i;
  for (i = 0; i < dst_width - 1; i += 2) {
    dst_pixels[i] = src_pixels[pos >> 16];
    pos += dx;
    dst_pixels[i + 1] = src_pixels[pos >> 16];
    pos += dx;
  }
  if (dst_width & 1) {
    dst_pixels[i] = src_pixels[pos >> 16];
  }
}

// Scales a single row of pixels up by 2x using point sampling.
// Output pixel i is source pixel i / 2; x and dx are ignored.
void ScaleARGBColsUp2_C(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint32_t* src_pixels = (const uint32_t*)(src_argb);
  uint32_t* dst_pixels = (uint32_t*)(dst_argb);
  int i;
  (void)x;
  (void)dx;
  for (i = 0; i < dst_width - 1; i += 2) {
    const uint32_t pixel = src_pixels[i >> 1];
    dst_pixels[i] = pixel;
    dst_pixels[i + 1] = pixel;
  }
  if (dst_width & 1) {
    dst_pixels[i] = src_pixels[i >> 1];
  }
}

// TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607.
// Mimics SSSE3 blender
// BLENDER1 blends two 8 bit channel values with a 7 bit fraction f,
// BLENDERC applies it to the channel at bit offset s of a packed pixel and
// BLENDER combines the four channels of a 32 bit pixel.
#define BLENDER1(a, b, f) ((((a) * (0x7f ^ (f))) + ((b) * (f))) >> 7)
#define BLENDERC(a, b, f, s) \
  (uint32_t)(BLENDER1(((a) >> (s)) & 255, ((b) >> (s)) & 255, f) << (s))
#define BLENDER(a, b, f)                               \
  (BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) |     \
   BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0))

// Blends source pixel x >> 16 with its right neighbour, using bits 9..15 of
// the 16.16 fixed point position x as a 7 bit fraction.
static __inline uint32_t ScaleARGBFilterPixel_C(const uint32_t* src,
                                                int64_t x) {
  const int64_t xi = x >> 16;
  const int xf = (int)((x >> 9) & 0x7f);
  const uint32_t a = src[xi];
  const uint32_t b = src[xi + 1];
  return BLENDER(a, b, xf);
}

// Horizontal linear filter of 4 byte pixels; x and dx are 16.16 fixed point.
// Reads one pixel past the position of each output.
void ScaleARGBFilterCols_C(uint8_t* dst_argb,
                           const uint8_t* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = ScaleARGBFilterPixel_C(src, x);
    // Skip the step after the last output so x is not advanced needlessly.
    if (i + 1 < dst_width) {
      x += dx;
    }
  }
}

// Same as ScaleARGBFilterCols_C but tracks the position in 64 bits.
void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
                             const uint8_t* src_argb,
                             int dst_width,
                             int x32,
                             int dx) {
  int64_t x = (int64_t)(x32);
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = ScaleARGBFilterPixel_C(src, x);
    x += dx;
  }
}
#undef BLENDER1
#undef BLENDERC
#undef BLENDER

// UV scale row functions
// same as ARGB but 2 channels

// 1/2 point sample of 2 byte UV pixels: output i is source pixel 2 * i + 1.
// The source advances two pixels per output; the previous code advanced only
// one, so outputs after the first pair were taken from the wrong pixels.
void ScaleUVRowDown2_C(const uint8_t* src_uv,
                       ptrdiff_t src_stride,
                       uint8_t* dst_uv,
                       int dst_width) {
  const uint16_t* src = (const uint16_t*)(src_uv);
  uint16_t* dst = (uint16_t*)(dst_uv);
  int i;
  (void)src_stride;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[1];
    src += 2;
  }
}

// 1/2 linear: each output channel is the rounded average of the same channel
// in two horizontally adjacent source pixels.
void ScaleUVRowDown2Linear_C(const uint8_t* src_uv,
                             ptrdiff_t src_stride,
                             uint8_t* dst_uv,
                             int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      dst_uv[c] = (src_uv[c] + src_uv[c + 2] + 1) >> 1;
    }
    src_uv += 4;
    dst_uv += 2;
  }
}

// 1/2 box: each output channel is the rounded average of a 2x2 pixel block
// taken from this row and the row src_stride bytes below.
void ScaleUVRowDown2Box_C(const uint8_t* src_uv,
                          ptrdiff_t src_stride,
                          uint8_t* dst_uv,
                          int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      dst_uv[c] = (src_uv[c] + src_uv[c + 2] + src_uv[src_stride + c] +
                   src_uv[src_stride + c + 2] + 2) >>
                  2;
    }
    src_uv += 4;
    dst_uv += 2;
  }
}

// Point sample every src_stepx-th pixel, starting with pixel 0.
void ScaleUVRowDownEven_C(const uint8_t* src_uv,
                          ptrdiff_t src_stride,
                          int src_stepx,
                          uint8_t* dst_uv,
                          int dst_width) {
  const uint16_t* src = (const uint16_t*)(src_uv);
  uint16_t* dst = (uint16_t*)(dst_uv);
  int i;
  (void)src_stride;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[0];
    src += src_stepx;
  }
}

// 2x2 box filter whose windows start src_stepx pixels apart.
void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
                             ptrdiff_t src_stride,
                             int src_stepx,
                             uint8_t* dst_uv,
                             int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      dst_uv[c] = (src_uv[c] + src_uv[c + 2] + src_uv[src_stride + c] +
                   src_uv[src_stride + c + 2] + 2) >>
                  2;
    }
    src_uv += src_stepx * 2;
    dst_uv += 2;
  }
}

// 2x linear upsample of interleaved UV.  Each source pixel pair (x, x + 1)
// yields two output pixels weighted 3:1 and 1:3 per channel.  dst_width is in
// pixels and must be even.
void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int dst_width) {
  const int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      const int left = src_ptr[2 * x + c];
      const int right = src_ptr[2 * x + 2 + c];
      dst_ptr[4 * x + c] = (left * 3 + right + 2) >> 2;
      dst_ptr[4 * x + 2 + c] = (left + right * 3 + 2) >> 2;
    }
  }
}

// 2x bilinear upsample of interleaved UV.  Reads rows src_ptr and
// src_ptr + src_stride, writes rows dst_ptr and dst_ptr + dst_stride with
// 9:3:3:1 weights per channel.
void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              ptrdiff_t dst_stride,
                              int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  uint8_t* d = dst_ptr;
  uint8_t* e = dst_ptr + dst_stride;
  const int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      const int s0 = s[2 * x + c];
      const int s1 = s[2 * x + 2 + c];
      const int t0 = t[2 * x + c];
      const int t1 = t[2 * x + 2 + c];
      d[4 * x + c] = (s0 * 9 + s1 * 3 + t0 * 3 + t1 + 8) >> 4;
      d[4 * x + 2 + c] = (s0 * 3 + s1 * 9 + t0 + t1 * 3 + 8) >> 4;
      e[4 * x + c] = (s0 * 3 + s1 + t0 * 9 + t1 * 3 + 8) >> 4;
      e[4 * x + 2 + c] = (s0 + s1 * 3 + t0 * 3 + t1 * 9 + 8) >> 4;
    }
  }
}

// 16 bit variant of ScaleUVRowUp2_Linear_C.
void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               int dst_width) {
  const int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      const int left = src_ptr[2 * x + c];
      const int right = src_ptr[2 * x + 2 + c];
      dst_ptr[4 * x + c] = (left * 3 + right + 2) >> 2;
      dst_ptr[4 * x + 2 + c] = (left + right * 3 + 2) >> 2;
    }
  }
}

// 16 bit variant of ScaleUVRowUp2_Bilinear_C.
void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint16_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  const uint16_t* s = src_ptr;
  const uint16_t* t = src_ptr + src_stride;
  uint16_t* d = dst_ptr;
  uint16_t* e = dst_ptr + dst_stride;
  const int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    int c;
    for (c = 0; c < 2; ++c) {
      const int s0 = s[2 * x + c];
      const int s1 = s[2 * x + 2 + c];
      const int t0 = t[2 * x + c];
      const int t1 = t[2 * x + 2 + c];
      d[4 * x + c] = (s0 * 9 + s1 * 3 + t0 * 3 + t1 + 8) >> 4;
      d[4 * x + 2 + c] = (s0 * 3 + s1 * 9 + t0 + t1 * 3 + 8) >> 4;
      e[4 * x + c] = (s0 * 3 + s1 + t0 * 9 + t1 * 3 + 8) >> 4;
      e[4 * x + 2 + c] = (s0 + s1 * 3 + t0 * 3 + t1 * 9 + 8) >> 4;
    }
  }
}

// Scales a single row of pixels using point sampling.
// x and dx are 16.16 fixed point; output i is source pixel (x + i * dx) >> 16.
void ScaleUVCols_C(uint8_t* dst_uv,
                   const uint8_t* src_uv,
                   int dst_width,
                   int x,
                   int dx) {
  const uint16_t* src = (const uint16_t*)(src_uv);
  uint16_t* dst = (uint16_t*)(dst_uv);
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];
    // Skip the step after the last output so x is not advanced needlessly.
    if (i + 1 < dst_width) {
      x += dx;
    }
  }
}

// Same as ScaleUVCols_C but tracks the position in 64 bits.
void ScaleUVCols64_C(uint8_t* dst_uv,
                     const uint8_t* src_uv,
                     int dst_width,
                     int x32,
                     int dx) {
  int64_t x = (int64_t)(x32);
  const uint16_t* src = (const uint16_t*)(src_uv);
  uint16_t* dst = (uint16_t*)(dst_uv);
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[x >> 16];
    x += dx;
  }
}

// Scales a single row of pixels up by 2x using point sampling.
void ScaleUVColsUp2_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) { const uint16_t* src = (const uint16_t*)(src_uv); uint16_t* dst = (uint16_t*)(dst_uv); int j; (void)x; (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst[1] = dst[0] = src[0]; src += 1; dst += 2; } if (dst_width & 1) { dst[0] = src[0]; } } // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. // Mimics SSSE3 blender #define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 #define BLENDERC(a, b, f, s) \ (uint16_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) #define BLENDER(a, b, f) BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) void ScaleUVFilterCols_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) { const uint16_t* src = (const uint16_t*)(src_uv); uint16_t* dst = (uint16_t*)(dst_uv); int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; uint16_t a = src[xi]; uint16_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; xf = (x >> 9) & 0x7f; a = src[xi]; b = src[xi + 1]; dst[1] = BLENDER(a, b, xf); x += dx; dst += 2; } if (dst_width & 1) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; uint16_t a = src[xi]; uint16_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } void ScaleUVFilterCols64_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, int x32, int dx) { int64_t x = (int64_t)(x32); const uint16_t* src = (const uint16_t*)(src_uv); uint16_t* dst = (uint16_t*)(dst_uv); int j; for (j = 0; j < dst_width - 1; j += 2) { int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; uint16_t a = src[xi]; uint16_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; xf = (x >> 9) & 0x7f; a = src[xi]; b = src[xi + 1]; dst[1] = BLENDER(a, b, xf); x += dx; dst += 2; } if (dst_width & 1) { int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; uint16_t a = src[xi]; uint16_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } #undef BLENDER1 #undef BLENDERC #undef BLENDER // Scale plane vertically with 
bilinear interpolation. void ScalePlaneVertical(int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_argb, uint8_t* dst_argb, int x, int y, int dy, int bpp, enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(bpp >= 1 && bpp <= 4); assert(src_height != 0); assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * bpp; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width_bytes, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(dst_width_bytes, 8)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(dst_width_bytes, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif for (j = 0; j < dst_height; ++j) { int yi; int yf; if (y > max_y) { y = max_y; } yi = y >> 16; yf = filtering ? 
((y >> 8) & 255) : 0; InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, dst_width_bytes, yf); dst_argb += dst_stride; y += dy; } } void ScalePlaneVertical_16(int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_argb, uint16_t* dst_argb, int x, int y, int dy, int wpp, enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(wpp >= 1 && wpp <= 2); assert(src_height != 0); assert(dst_width > 0); assert(dst_height > 0); src_argb += (x >> 16) * wpp; #if defined(HAS_INTERPOLATEROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { InterpolateRow = InterpolateRow_Any_16_SSE2; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_16_SSE2; } } #endif #if defined(HAS_INTERPOLATEROW_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_16_SSSE3; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_16_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_16_AVX2; if (IS_ALIGNED(dst_width_bytes, 32)) { InterpolateRow = InterpolateRow_16_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_16_NEON; if (IS_ALIGNED(dst_width_bytes, 16)) { InterpolateRow = InterpolateRow_16_NEON; } } #endif for (j = 0; j < dst_height; ++j) { int yi; int yf; if (y > max_y) { y = max_y; } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, dst_width_words, yf); dst_argb += dst_stride; y += dy; } } // Simplify the filtering based on scale factors. 
enum FilterMode ScaleFilterReduce(int src_width, int src_height, int dst_width, int dst_height, enum FilterMode filtering) { if (src_width < 0) { src_width = -src_width; } if (src_height < 0) { src_height = -src_height; } if (filtering == kFilterBox) { // If scaling either axis to 0.5 or larger, switch from Box to Bilinear. if (dst_width * 2 >= src_width || dst_height * 2 >= src_height) { filtering = kFilterBilinear; } } if (filtering == kFilterBilinear) { if (src_height == 1) { filtering = kFilterLinear; } // TODO(fbarchard): Detect any odd scale factor and reduce to Linear. if (dst_height == src_height || dst_height * 3 == src_height) { filtering = kFilterLinear; } // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to // avoid reading 2 pixels horizontally that causes memory exception. if (src_width == 1) { filtering = kFilterNone; } } if (filtering == kFilterLinear) { if (src_width == 1) { filtering = kFilterNone; } // TODO(fbarchard): Detect any odd scale factor and reduce to None. if (dst_width == src_width || dst_width * 3 == src_width) { filtering = kFilterNone; } } return filtering; } // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div) { return (int)(((int64_t)(num) << 16) / div); } // Divide num by div and return as 16.16 fixed point result. int FixedDiv1_C(int num, int div) { return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); } #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Compute slope values for stepping. void ScaleSlope(int src_width, int src_height, int dst_width, int dst_height, enum FilterMode filtering, int* x, int* y, int* dx, int* dy) { assert(x != NULL); assert(y != NULL); assert(dx != NULL); assert(dy != NULL); assert(src_width != 0); assert(src_height != 0); assert(dst_width > 0); assert(dst_height > 0); // Check for 1 pixel and avoid FixedDiv overflow. 
if (dst_width == 1 && src_width >= 32768) { dst_width = src_width; } if (dst_height == 1 && src_height >= 32768) { dst_height = src_height; } if (filtering == kFilterBox) { // Scale step for point sampling duplicates all pixels equally. *dx = FixedDiv(Abs(src_width), dst_width); *dy = FixedDiv(src_height, dst_height); *x = 0; *y = 0; } else if (filtering == kFilterBilinear) { // Scale step for bilinear sampling renders last pixel once for upsample. if (dst_width <= Abs(src_width)) { *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_width > 1) { *dx = FixedDiv1(Abs(src_width), dst_width); *x = 0; } if (dst_height <= src_height) { *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_height > 1) { *dy = FixedDiv1(src_height, dst_height); *y = 0; } } else if (filtering == kFilterLinear) { // Scale step for bilinear sampling renders last pixel once for upsample. if (dst_width <= Abs(src_width)) { *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_width > 1) { *dx = FixedDiv1(Abs(src_width), dst_width); *x = 0; } *dy = FixedDiv(src_height, dst_height); *y = *dy >> 1; } else { // Scale step for point sampling duplicates all pixels equally. *dx = FixedDiv(Abs(src_width), dst_width); *dy = FixedDiv(src_height, dst_height); *x = CENTERSTART(*dx, 0); *y = CENTERSTART(*dy, 0); } // Negative src_width means horizontally mirror. if (src_width < 0) { *x += (dst_width - 1) * *dx; *dx = -*dx; // src_width = -src_width; // Caller must do this. } } #undef CENTERSTART // Read 8x2 upsample with filtering and write 16x1. // actually reads an extra pixel, so 9x2. 
void ScaleRowUp2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { const uint16_t* src2 = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { uint16_t p0 = src_ptr[0]; uint16_t p1 = src_ptr[1]; uint16_t p2 = src2[0]; uint16_t p3 = src2[1]; dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; ++src_ptr; ++src2; dst += 2; } if (dst_width & 1) { uint16_t p0 = src_ptr[0]; uint16_t p1 = src_ptr[1]; uint16_t p2 = src2[0]; uint16_t p3 = src2[1]; dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; } } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_gcc.cc000066400000000000000000004246531416500237200212560ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC x86 and x64. #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 
128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0}; // GCC versions of row functions are verbatim conversions from Visual C. // Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( // 16 pixel loop. LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "psrlw $0x8,%%xmm0 \n" "psrlw $0x8,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pavgw %%xmm5,%%xmm0 \n" "pavgw %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm2 \n" "movdqu 0x10(%0,%3,1),%%xmm3 \n" "lea 0x20(%0),%0 \n" "pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw 
%%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "psrlw $0x1,%%xmm0 \n" "psrlw $0x1,%%xmm1 \n" "pavgw %%xmm5,%%xmm0 \n" "pavgw %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEROWDOWN2_AVX2 void ScaleRowDown2_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile(LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1"); } void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw 
$0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" "lea 0x40(%0),%0 \n" "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" "vpsrlw $0x1,%%ymm0,%%ymm0 \n" "vpsrlw $0x1,%%ymm1,%%ymm1 \n" "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" "pslld $0x10,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" "psrlw $0x8,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { intptr_t stridex3; asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" "psrlw $0xf,%%xmm4 \n" "movdqa %%xmm4,%%xmm5 \n" "packuswb %%xmm4,%%xmm4 \n" "psllw $0x3,%%xmm5 \n" "lea 0x00(%4,%4,2),%3 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%4,1),%%xmm2 \n" "movdqu 0x10(%0,%4,1),%%xmm3 \n" 
"pmaddubsw %%xmm4,%%xmm0 \n" "pmaddubsw %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "movdqu 0x00(%0,%4,2),%%xmm2 \n" "movdqu 0x10(%0,%4,2),%%xmm3 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm2 \n" "movdqu 0x10(%0,%3,1),%%xmm3 \n" "lea 0x20(%0),%0 \n" "pmaddubsw %%xmm4,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n" "paddw %%xmm3,%%xmm1 \n" "phaddw %%xmm1,%%xmm0 \n" "paddw %%xmm5,%%xmm0 \n" "psrlw $0x4,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "=&r"(stridex3) // %3 : "r"((intptr_t)(src_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #ifdef HAS_SCALEROWDOWN4_AVX2 void ScaleRowDown4_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" "vpsrld $0x18,%%ymm5,%%ymm5 \n" "vpslld $0x10,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" "vmovdqu 0x20(%0),%%ymm1 \n" "lea 0x40(%0),%0 \n" "vpand %%ymm5,%%ymm0,%%ymm0 \n" "vpand %%ymm5,%%ymm1,%%ymm1 \n" "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpsrlw $0x8,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vmovdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1", "xmm5"); } void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpsllw $0x3,%%ymm4,%%ymm5 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" 
"vmovdqu 0x20(%0),%%ymm1 \n" "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" "lea 0x40(%0),%0 \n" "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" "vpsrlw $0x4,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vmovdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(src_stride * 3)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "movdqa %0,%%xmm3 \n" "movdqa %1,%%xmm4 \n" "movdqa %2,%%xmm5 \n" : : "m"(kShuf0), // %0 "m"(kShuf1), // %1 "m"(kShuf2) // %2 ); asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm2 \n" "lea 0x20(%0),%0 \n" "movdqa %%xmm2,%%xmm1 \n" "palignr $0x8,%%xmm0,%%xmm1 \n" "pshufb %%xmm3,%%xmm0 \n" "pshufb %%xmm4,%%xmm1 \n" "pshufb %%xmm5,%%xmm2 \n" "movq %%xmm0,(%1) \n" "movq %%xmm1,0x8(%1) \n" "movq %%xmm2,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", 
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 ); asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "paddsw %%xmm1,%%xmm6 \n" "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6,(%1) \n" "movdqu 0x8(%0),%%xmm6 \n" "movdqu 0x8(%0,%3,1),%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm0,%%xmm6 \n" "paddsw %%xmm1,%%xmm6 \n" "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6,0x8(%1) \n" "movdqu 0x10(%0),%%xmm6 \n" "movdqu 0x10(%0,%3,1),%%xmm7 \n" "lea 0x20(%0),%0 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm4,%%xmm6 \n" "pmaddubsw %4,%%xmm6 \n" "paddsw %%xmm1,%%xmm6 \n" "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "m"(kMadd21) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" // kShuf01 "movdqa %1,%%xmm3 \n" // kShuf11 "movdqa %2,%%xmm4 \n" // kShuf21 : : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 "movdqa %2,%%xmm1 \n" // kRound34 : : "m"(kMadd01), // %0 "m"(kMadd11), // 
%1 "m"(kRound34) // %2 ); asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm6 \n" "movdqu 0x00(%0,%3,1),%%xmm7 \n" "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "paddsw %%xmm1,%%xmm6 \n" "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6,(%1) \n" "movdqu 0x8(%0),%%xmm6 \n" "movdqu 0x8(%0,%3,1),%%xmm7 \n" "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm3,%%xmm6 \n" "pmaddubsw %%xmm0,%%xmm6 \n" "paddsw %%xmm1,%%xmm6 \n" "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6,0x8(%1) \n" "movdqu 0x10(%0),%%xmm6 \n" "movdqu 0x10(%0,%3,1),%%xmm7 \n" "lea 0x20(%0),%0 \n" "pavgb %%xmm6,%%xmm7 \n" "pavgb %%xmm7,%%xmm6 \n" "pshufb %%xmm4,%%xmm6 \n" "pmaddubsw %4,%%xmm6 \n" "paddsw %%xmm1,%%xmm6 \n" "psrlw $0x2,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movq %%xmm6,0x10(%1) \n" "lea 0x18(%1),%1 \n" "sub $0x18,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "m"(kMadd21) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "pshufb %%xmm4,%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "paddusb %%xmm1,%%xmm0 \n" "movq %%xmm0,(%1) \n" "movhlps %%xmm0,%%xmm1 \n" "movd %%xmm1,0x8(%1) \n" "lea 0xc(%1),%1 \n" "sub $0xc,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kShuf38a), // %3 "m"(kShuf38b) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "movdqa %3,%%xmm5 \n" : : "m"(kShufAb0), 
// %0 "m"(kShufAb1), // %1 "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 ); asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm1 \n" "lea 0x10(%0),%0 \n" "pavgb %%xmm1,%%xmm0 \n" "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm2,%%xmm1 \n" "movdqa %%xmm0,%%xmm6 \n" "pshufb %%xmm3,%%xmm6 \n" "paddusw %%xmm6,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" "paddusw %%xmm0,%%xmm1 \n" "pmulhuw %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movd %%xmm1,(%1) \n" "psrlq $0x10,%%xmm1 \n" "movd %%xmm1,0x2(%1) \n" "lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "movdqa %0,%%xmm2 \n" "movdqa %1,%%xmm3 \n" "movdqa %2,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" : : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 ); asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x00(%0,%3,1),%%xmm6 \n" "movhlps %%xmm0,%%xmm1 \n" "movhlps %%xmm6,%%xmm7 \n" "punpcklbw %%xmm5,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm1 \n" "punpcklbw %%xmm5,%%xmm6 \n" "punpcklbw %%xmm5,%%xmm7 \n" "paddusw %%xmm6,%%xmm0 \n" "paddusw %%xmm7,%%xmm1 \n" "movdqu 0x00(%0,%3,2),%%xmm6 \n" "lea 0x10(%0),%0 \n" "movhlps %%xmm6,%%xmm7 \n" "punpcklbw %%xmm5,%%xmm6 \n" "punpcklbw %%xmm5,%%xmm7 \n" "paddusw %%xmm6,%%xmm0 \n" "paddusw %%xmm7,%%xmm1 \n" "movdqa %%xmm0,%%xmm6 \n" "psrldq $0x2,%%xmm0 \n" "paddusw %%xmm0,%%xmm6 \n" "psrldq $0x2,%%xmm0 \n" "paddusw %%xmm0,%%xmm6 \n" "pshufb %%xmm2,%%xmm6 \n" "movdqa %%xmm1,%%xmm7 \n" "psrldq $0x2,%%xmm1 \n" "paddusw %%xmm1,%%xmm7 \n" "psrldq $0x2,%%xmm1 \n" "paddusw %%xmm1,%%xmm7 \n" "pshufb %%xmm3,%%xmm7 \n" "paddusw %%xmm7,%%xmm6 \n" "pmulhuw %%xmm4,%%xmm6 \n" "packuswb %%xmm6,%%xmm6 \n" "movd %%xmm6,(%1) \n" "psrlq $0x10,%%xmm6 \n" "movd %%xmm6,0x2(%1) \n" 
"lea 0x6(%1),%1 \n" "sub $0x6,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } static const uvec8 kLinearShuffleFar = {2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; #ifdef HAS_SCALEROWUP2LINEAR_SSE2 void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( "pxor %%xmm0,%%xmm0 \n" // 0 "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $1,%%xmm6 \n" // all 2 LABELALIGN "1: \n" "movq (%0),%%xmm1 \n" // 01234567 "movq 1(%0),%%xmm2 \n" // 12345678 "movdqa %%xmm1,%%xmm3 \n" "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 "movdqa %%xmm1,%%xmm4 \n" "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) "movdqa %%xmm2,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) "paddw %%xmm5,%%xmm4 \n" "movdqa %%xmm3,%%xmm5 \n" "paddw %%xmm6,%%xmm4 \n" "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) "paddw %%xmm5,%%xmm5 \n" "paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo) "psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo) "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) "paddw %%xmm2,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) "paddw %%xmm6,%%xmm1 \n" "paddw %%xmm3,%%xmm3 \n" "paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) "psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) "packuswb %%xmm1,%%xmm5 \n" "movdqu %%xmm5,(%1) \n" "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_SSE2 void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, 
ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( LABELALIGN "1: \n" "pxor %%xmm0,%%xmm0 \n" // 0 // above line "movq (%0),%%xmm1 \n" // 01234567 "movq 1(%0),%%xmm2 \n" // 12345678 "movdqa %%xmm1,%%xmm3 \n" "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 "punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 "movdqa %%xmm1,%%xmm4 \n" "punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16) "movdqa %%xmm2,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16) "paddw %%xmm5,%%xmm4 \n" // near+far "movdqa %%xmm3,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16) "paddw %%xmm5,%%xmm5 \n" // 2*near "paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo) "punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16) "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) "paddw %%xmm2,%%xmm1 \n" "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) "paddw %%xmm3,%%xmm3 \n" // 2*near "paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) // below line "movq (%0,%3),%%xmm6 \n" // 01234567 "movq 1(%0,%3),%%xmm2 \n" // 12345678 "movdqa %%xmm6,%%xmm3 \n" "punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778 "punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677 "punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788 "movdqa %%xmm6,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16) "movdqa %%xmm2,%%xmm7 \n" "punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16) "paddw %%xmm7,%%xmm5 \n" // near+far "movdqa %%xmm3,%%xmm7 \n" "punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16) "paddw %%xmm7,%%xmm7 \n" // 2*near "paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo) "punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16) "punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16) "paddw %%xmm6,%%xmm2 \n" // near+far "punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16) "paddw %%xmm3,%%xmm3 \n" // 2*near "paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi) // xmm4 xmm1 // xmm5 xmm2 "pcmpeqw %%xmm0,%%xmm0 \n" "psrlw $15,%%xmm0 \n" "psllw $3,%%xmm0 \n" // all 8 "movdqa %%xmm4,%%xmm3 \n" "movdqa %%xmm5,%%xmm6 \n" "paddw 
%%xmm3,%%xmm3 \n" // 6*near+2*far (1, lo) "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo) "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo) "paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo) "psrlw $4,%%xmm3 \n" // ^ div by 16 "movdqa %%xmm1,%%xmm7 \n" "movdqa %%xmm2,%%xmm6 \n" "paddw %%xmm7,%%xmm7 \n" // 6*near+2*far (1, hi) "paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi) "paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi) "paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi) "psrlw $4,%%xmm7 \n" // ^ div by 16 "packuswb %%xmm7,%%xmm3 \n" "movdqu %%xmm3,(%1) \n" // save above line "movdqa %%xmm5,%%xmm3 \n" "paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo) "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, lo) "paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo) "paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo) "psrlw $4,%%xmm5 \n" // ^ div by 16 "movdqa %%xmm2,%%xmm3 \n" "paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi) "paddw %%xmm3,%%xmm3 \n" // 6*near+2*far (2, hi) "paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) "paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi) "psrlw $4,%%xmm2 \n" // ^ div by 16 "packuswb %%xmm2,%%xmm5 \n" "movdqu %%xmm5,(%1,%4) \n" // save below line "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_12_SSSE3 void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( "movdqa %3,%%xmm5 \n" "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 01234567 (16) "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) "movdqa %%xmm0,%%xmm2 \n" "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) "movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm0,%%xmm1 
\n" "pshufb %%xmm5,%%xmm3 \n" // 54657687 (far) "pshufb %%xmm5,%%xmm1 \n" // 10213243 (far) "paddw %%xmm4,%%xmm1 \n" // far+2 "paddw %%xmm4,%%xmm3 \n" // far+2 "paddw %%xmm0,%%xmm1 \n" // near+far+2 "paddw %%xmm2,%%xmm3 \n" // near+far+2 "paddw %%xmm0,%%xmm0 \n" // 2*near "paddw %%xmm2,%%xmm2 \n" // 2*near "paddw %%xmm1,%%xmm0 \n" // 3*near+far+2 (lo) "paddw %%xmm3,%%xmm2 \n" // 3*near+far+2 (hi) "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far "movdqu %%xmm0,(%1) \n" "movdqu %%xmm2,16(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearShuffleFar) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_12_SSSE3 void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "pcmpeqw %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" "psllw $3,%%xmm7 \n" // all 8 "movdqa %5,%%xmm6 \n" LABELALIGN "1: \n" // above line "movdqu (%0),%%xmm0 \n" // 01234567 (16) "movdqu 2(%0),%%xmm1 \n" // 12345678 (16) "movdqa %%xmm0,%%xmm2 \n" "punpckhwd %%xmm1,%%xmm2 \n" // 45566778 (16) "punpcklwd %%xmm1,%%xmm0 \n" // 01122334 (16) "movdqa %%xmm2,%%xmm3 \n" "movdqa %%xmm0,%%xmm1 \n" "pshufb %%xmm6,%%xmm3 \n" // 54657687 (far) "pshufb %%xmm6,%%xmm1 \n" // 10213243 (far) "paddw %%xmm0,%%xmm1 \n" // near+far "paddw %%xmm2,%%xmm3 \n" // near+far "paddw %%xmm0,%%xmm0 \n" // 2*near "paddw %%xmm2,%%xmm2 \n" // 2*near "paddw %%xmm1,%%xmm0 \n" // 3*near+far (1, lo) "paddw %%xmm3,%%xmm2 \n" // 3*near+far (1, hi) // below line "movdqu (%0,%3,2),%%xmm1 \n" // 01234567 (16) "movdqu 2(%0,%3,2),%%xmm4 \n" // 12345678 (16) "movdqa %%xmm1,%%xmm3 \n" "punpckhwd %%xmm4,%%xmm3 \n" // 45566778 (16) "punpcklwd %%xmm4,%%xmm1 \n" // 01122334 (16) "movdqa %%xmm3,%%xmm5 \n" "movdqa %%xmm1,%%xmm4 \n" "pshufb 
%%xmm6,%%xmm5 \n" // 54657687 (far) "pshufb %%xmm6,%%xmm4 \n" // 10213243 (far) "paddw %%xmm1,%%xmm4 \n" // near+far "paddw %%xmm3,%%xmm5 \n" // near+far "paddw %%xmm1,%%xmm1 \n" // 2*near "paddw %%xmm3,%%xmm3 \n" // 2*near "paddw %%xmm4,%%xmm1 \n" // 3*near+far (2, lo) "paddw %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) // xmm0 xmm2 // xmm1 xmm3 "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm1,%%xmm5 \n" "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, lo) "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, lo) "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) "psrlw $4,%%xmm4 \n" // ^ div by 16 "movdqu %%xmm4,(%1) \n" "movdqa %%xmm2,%%xmm4 \n" "movdqa %%xmm3,%%xmm5 \n" "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (1, hi) "paddw %%xmm7,%%xmm5 \n" // 3*near+far+8 (2, hi) "paddw %%xmm2,%%xmm4 \n" // 9*near+3*far (1, hi) "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, hi) "psrlw $4,%%xmm4 \n" // ^ div by 16 "movdqu %%xmm4,0x10(%1) \n" "movdqa %%xmm1,%%xmm4 \n" "paddw %%xmm7,%%xmm0 \n" // 3*near+far+8 (1, lo) "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, lo) "paddw %%xmm4,%%xmm1 \n" // 9*near+3*far (2, lo) "paddw %%xmm0,%%xmm1 \n" // 9 3 3 1 + 8 (2, lo) "psrlw $4,%%xmm1 \n" // ^ div by 16 "movdqu %%xmm1,(%1,%4,2) \n" "movdqa %%xmm3,%%xmm4 \n" "paddw %%xmm7,%%xmm2 \n" // 3*near+far+8 (1, hi) "paddw %%xmm4,%%xmm4 \n" // 6*near+2*far (2, hi) "paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (2, hi) "paddw %%xmm2,%%xmm3 \n" // 9 3 3 1 + 8 (2, hi) "psrlw $4,%%xmm3 \n" // ^ div by 16 "movdqu %%xmm3,0x10(%1,%4,2) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, 
int dst_width) { asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 0123 (16b) "movq 2(%0),%%xmm1 \n" // 1234 (16b) "punpcklwd %%xmm5,%%xmm0 \n" // 0123 (32b) "punpcklwd %%xmm5,%%xmm1 \n" // 1234 (32b) "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) "packssdw %%xmm1,%%xmm0 \n" "pshufd $0b11011000,%%xmm0,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) "paddd %%xmm1,%%xmm3 \n" // near+far 
(1, hi) "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) "movq (%0),%%xmm0 \n" // 0123 (16b) "movq 2(%0),%%xmm1 \n" // 1234 (16b) "punpcklwd %%xmm7,%%xmm0 \n" // 0123 (32b) "punpcklwd %%xmm7,%%xmm1 \n" // 1234 (32b) "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pshufd $0b10110001,%%xmm2,%%xmm2 \n" // 1032 (even, far) "pshufd $0b10110001,%%xmm3,%%xmm3 \n" // 2143 (odd, far) "paddd %%xmm0,%%xmm2 \n" // near+far (lo) "paddd %%xmm1,%%xmm3 \n" // near+far (hi) "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) "movq (%0,%3,2),%%xmm2 \n" "movq 2(%0,%3,2),%%xmm3 \n" "punpcklwd %%xmm7,%%xmm2 \n" // 0123 (32b) "punpcklwd %%xmm7,%%xmm3 \n" // 1234 (32b) "movdqa %%xmm2,%%xmm4 \n" "movdqa %%xmm3,%%xmm5 \n" "pshufd $0b10110001,%%xmm4,%%xmm4 \n" // 1032 (even, far) "pshufd $0b10110001,%%xmm5,%%xmm5 \n" // 2143 (odd, far) "paddd %%xmm2,%%xmm4 \n" // near+far (lo) "paddd %%xmm3,%%xmm5 \n" // near+far (hi) "paddd %%xmm2,%%xmm2 \n" // 2*near (lo) "paddd %%xmm3,%%xmm3 \n" // 2*near (hi) "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm2,%%xmm5 \n" "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) "movdqa %%xmm2,%%xmm5 \n" "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) "paddd %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) "movdqa %%xmm1,%%xmm0 \n" "movdqa %%xmm3,%%xmm2 \n" "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) 
"paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) "movdqa %%xmm3,%%xmm2 \n" "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) "packssdw %%xmm0,%%xmm4 \n" "pshufd $0b11011000,%%xmm4,%%xmm4 \n" "movdqu %%xmm4,(%1) \n" // store above "packssdw %%xmm2,%%xmm5 \n" "pshufd $0b11011000,%%xmm4,%%xmm4 \n" "movdqu %%xmm5,(%1,%4,2) \n" // store below "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 4 pixel to 8 pixel "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_SSSE3 void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 01234567 "movq 1(%0),%%xmm1 \n" // 12345678 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 "movdqa %%xmm0,%%xmm2 \n" "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi) "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo) "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n" "vmovdqu %%xmm0,(%1) \n" "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 
"+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 01234567 "movq 1(%0),%%xmm1 \n" // 12345678 "punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767 "punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878 "movdqa %%xmm0,%%xmm2 \n" "punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878 "punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434 "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi) "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo) "movq (%0,%3),%%xmm1 \n" "movq 1(%0,%3),%%xmm4 \n" "punpcklwd %%xmm1,%%xmm1 \n" "punpcklwd %%xmm4,%%xmm4 \n" "movdqa %%xmm1,%%xmm3 \n" "punpckhdq %%xmm4,%%xmm3 \n" "punpckldq %%xmm4,%%xmm1 \n" "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) // xmm0 xmm2 // xmm1 xmm3 "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm1,%%xmm5 \n" "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) "movdqa %%xmm1,%%xmm5 \n" "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) "movdqa %%xmm2,%%xmm0 \n" "movdqa %%xmm3,%%xmm1 \n" "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) "psrlw $4,%%xmm0 \n" 
// ^ div by 16 (1, hi) "movdqa %%xmm3,%%xmm1 \n" "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) "packuswb %%xmm0,%%xmm4 \n" "movdqu %%xmm4,(%1) \n" // store above "packuswb %%xmm1,%%xmm5 \n" "movdqu %%xmm5,(%1,%4) \n" // store below "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_AVX2 void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" "vpermq $0b11011000,%%ymm1,%%ymm1 \n" "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearMadd31) // %3 : "memory", "cc", "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_AVX2 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF "vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0 "vpermq $0b11011000,%%ymm0,%%ymm0 \n" "vpermq $0b11011000,%%ymm1,%%ymm1 \n" "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" "vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n" "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" "vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF "vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 "vpermq $0b11011000,%%ymm2,%%ymm2 \n" "vpermq $0b11011000,%%ymm3,%%ymm3 \n" "vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" "vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n" "vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) // ymm0 ymm1 // ymm2 ymm3 "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far 
(1, hi) "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" "vmovdqu %%ymm4,(%1) \n" // store above "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" "vmovdqu %%ymm5,(%1,%4) \n" // store below "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_12_AVX2 void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( "vbroadcastf128 %3,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" // 0123456789ABCDEF (16b) "vmovdqu 2(%0),%%ymm1 \n" // 123456789ABCDEF0 (16b) "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 012389AB4567CDEF "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 12349ABC5678DEF0 "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // 899AABBCCDDEEFF0 (near) "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) "vpshufb %%ymm5,%%ymm2,%%ymm3 \n" // 98A9BACBDCEDFE0F (far) "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // far+2 "vpaddw %%ymm4,%%ymm3,%%ymm3 \n" // far+2 "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far+2 "vpaddw %%ymm2,%%ymm3,%%ymm3 \n" // near+far+2 "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near "vpaddw %%ymm2,%%ymm2,%%ymm2 \n" // 2*near "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 3*near+far+2 
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 3*near+far+2 "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far "vpsrlw $2,%%ymm2,%%ymm2 \n" // 3/4*near+1/4*far "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm2,32(%1) \n" "lea 0x20(%0),%0 \n" "lea 0x40(%1),%1 \n" // 16 sample to 32 sample "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kLinearShuffleFar) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_12_AVX2 void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "vbroadcastf128 %5,%%ymm5 \n" "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $3,%%ymm4,%%ymm4 \n" // all 8 LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b) "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b) "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near "vpaddw %%ymm0,%%ymm1,%%ymm2 \n" // 3*near+far (1) "vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b) "vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b) "vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000 "vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000 "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // 0112233445566778 (near) "vpshufb %%ymm5,%%ymm0,%%ymm1 \n" // 1021324354657687 (far) "vpaddw %%ymm0,%%ymm1,%%ymm1 \n" // near+far "vpaddw %%ymm0,%%ymm0,%%ymm0 \n" // 2*near "vpaddw %%ymm0,%%ymm1,%%ymm3 \n" // 3*near+far (2) "vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1) "vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2) "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1) "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1) "vpsrlw 
$4,%%ymm0,%%ymm0 \n" // ^ div by 16 "vmovdqu %%ymm0,(%1) \n" // store above "vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2) "vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1) "vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2) "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2) "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 "vmovdqu %%ymm0,(%1,%4,2) \n" // store below "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 sample to 16 sample "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kLinearShuffleFar) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif #ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" "vpshufd $0b11011000,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel 
"sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif #ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 01234567 (16b, 1u1v) "vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b, 1u1v) "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b10110001,%%ymm0,%%ymm2 \n" // 10325476 (lo, far) "vpshufd $0b10110001,%%ymm1,%%ymm3 \n" // 21436587 (hi, far) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (1, lo) "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (1, hi) "vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b, 1u1v) "vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b, 1u1v) "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) "vpshufd $0b10110001,%%ymm2,%%ymm4 \n" // 10325476 (lo, far) "vpshufd $0b10110001,%%ymm3,%%ymm5 \n" // 21436587 (hi, far) "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) "vpaddd %%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (2, lo) "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (2, hi) "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) 
"vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" "vpshufd $0b11011000,%%ymm4,%%ymm4 \n" "vmovdqu %%ymm4,(%1) \n" // store above "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" "vpshufd $0b11011000,%%ymm5,%%ymm5 \n" "vmovdqu %%ymm5,(%1,%4,2) \n" // store below "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 pixel to 16 pixel "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // Reads 16xN bytes and produces 16 shorts at a time. void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { asm volatile("pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. 
LABELALIGN "1: \n" "movdqu (%0),%%xmm3 \n" "lea 0x10(%0),%0 \n" // src_ptr += 16 "movdqu (%1),%%xmm0 \n" "movdqu 0x10(%1),%%xmm1 \n" "movdqa %%xmm3,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm2 \n" "punpckhbw %%xmm5,%%xmm3 \n" "paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm3,%%xmm1 \n" "movdqu %%xmm0,(%1) \n" "movdqu %%xmm1,0x10(%1) \n" "lea 0x20(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%ymm3 \n" "lea 0x20(%0),%0 \n" // src_ptr += 32 "vpermq $0xd8,%%ymm3,%%ymm3 \n" "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" "vpaddusw (%1),%%ymm2,%%ymm0 \n" "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 // Constant for making pixels signed to avoid pmaddubsw // saturation. static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. 
void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) { intptr_t x0, x1, temp_pixel; asm volatile( "movd %6,%%xmm2 \n" "movd %7,%%xmm3 \n" "movl $0x04040000,%k2 \n" "movd %k2,%%xmm5 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" // 0x007f007f "pcmpeqb %%xmm7,%%xmm7 \n" "psrlw $15,%%xmm7 \n" // 0x00010001 "pextrw $0x1,%%xmm2,%k3 \n" "subl $0x2,%5 \n" "jl 29f \n" "movdqa %%xmm2,%%xmm0 \n" "paddd %%xmm3,%%xmm0 \n" "punpckldq %%xmm0,%%xmm2 \n" "punpckldq %%xmm3,%%xmm3 \n" "paddd %%xmm3,%%xmm3 \n" "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" "movdqa %%xmm2,%%xmm1 \n" "paddd %%xmm3,%%xmm2 \n" "movzwl 0x00(%1,%3,1),%k2 \n" "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm1 \n" "movzwl 0x00(%1,%4,1),%k2 \n" "movd %k2,%%xmm4 \n" "pshufb %%xmm5,%%xmm1 \n" "punpcklwd %%xmm4,%%xmm0 \n" "psubb %8,%%xmm0 \n" // make pixels signed. "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + // 1 "paddusb %%xmm7,%%xmm1 \n" "pmaddubsw %%xmm0,%%xmm1 \n" "pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x3,%%xmm2,%k4 \n" "paddw %9,%%xmm1 \n" // make pixels unsigned. "psrlw $0x7,%%xmm1 \n" "packuswb %%xmm1,%%xmm1 \n" "movd %%xmm1,%k2 \n" "mov %w2,(%0) \n" "lea 0x2(%0),%0 \n" "subl $0x2,%5 \n" "jge 2b \n" LABELALIGN "29: \n" "addl $0x1,%5 \n" "jl 99f \n" "movzwl 0x00(%1,%3,1),%k2 \n" "movd %k2,%%xmm0 \n" "psrlw $0x9,%%xmm2 \n" "pshufb %%xmm5,%%xmm2 \n" "psubb %8,%%xmm0 \n" // make pixels signed. "pxor %%xmm6,%%xmm2 \n" "paddusb %%xmm7,%%xmm2 \n" "pmaddubsw %%xmm0,%%xmm2 \n" "paddw %9,%%xmm2 \n" // make pixels unsigned. 
"psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm2 \n" "movd %%xmm2,%k2 \n" "mov %b2,(%0) \n" "99: \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "=&a"(temp_pixel), // %2 "=&r"(x0), // %3 "=&r"(x1), // %4 #if defined(__x86_64__) "+rm"(dst_width) // %5 #else "+m"(dst_width) // %5 #endif : "rm"(x), // %6 "rm"(dx), // %7 #if defined(__x86_64__) "x"(kFsub80), // %8 "x"(kFadd40) // %9 #else "m"(kFsub80), // %8 "m"(kFadd40) // %9 #endif : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. void ScaleColsUp2_SSE2(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) { (void)x; (void)dx; asm volatile(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" "movdqa %%xmm0,%%xmm1 \n" "punpcklbw %%xmm0,%%xmm0 \n" "punpckhbw %%xmm1,%%xmm1 \n" "movdqu %%xmm0,(%0) \n" "movdqu %%xmm1,0x10(%0) \n" "lea 0x20(%0),%0 \n" "sub $0x20,%2 \n" "jg 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { (void)src_stride; asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "shufps $0xdd,%%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { (void)src_stride; asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "lea 0x20(%0),%0 \n" "movdqa %%xmm0,%%xmm2 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 
1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1"); } void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { asm volatile(LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" "movdqu 0x10(%0),%%xmm1 \n" "movdqu 0x00(%0,%3,1),%%xmm2 \n" "movdqu 0x10(%0,%3,1),%%xmm3 \n" "lea 0x20(%0),%0 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; (void)src_stride; asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" LABELALIGN "1: \n" "movd (%0),%%xmm0 \n" "movd 0x00(%0,%1,1),%%xmm1 \n" "punpckldq %%xmm1,%%xmm0 \n" "movd 0x00(%0,%1,2),%%xmm2 \n" "movd 0x00(%0,%4,1),%%xmm3 \n" "lea 0x00(%0,%1,4),%0 \n" "punpckldq %%xmm3,%%xmm2 \n" "punpcklqdq %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 "+r"(dst_width), // %3 "=&r"(src_stepx_x12) // %4 ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. 
void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); asm volatile( "lea 0x00(,%1,4),%1 \n" "lea 0x00(%1,%1,2),%4 \n" "lea 0x00(%0,%5,1),%5 \n" LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" "movhps 0x00(%0,%1,1),%%xmm0 \n" "movq 0x00(%0,%1,2),%%xmm1 \n" "movhps 0x00(%0,%4,1),%%xmm1 \n" "lea 0x00(%0,%1,4),%0 \n" "movq (%5),%%xmm2 \n" "movhps 0x00(%5,%1,1),%%xmm2 \n" "movq 0x00(%5,%1,2),%%xmm3 \n" "movhps 0x00(%5,%4,1),%%xmm3 \n" "lea 0x00(%5,%1,4),%5 \n" "pavgb %%xmm2,%%xmm0 \n" "pavgb %%xmm3,%%xmm1 \n" "movdqa %%xmm0,%%xmm2 \n" "shufps $0x88,%%xmm1,%%xmm0 \n" "shufps $0xdd,%%xmm1,%%xmm2 \n" "pavgb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" "sub $0x4,%3 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_stepx_x4), // %1 "+r"(dst_argb), // %2 "+rm"(dst_width), // %3 "=&r"(src_stepx_x12), // %4 "+r"(row1) // %5 ::"memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } void ScaleARGBCols_SSE2(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { intptr_t x0, x1; asm volatile( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm2,%%xmm2 \n" "pshufd $0x11,%%xmm3,%%xmm0 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm3 \n" "pshufd $0x5,%%xmm3,%%xmm0 \n" "paddd %%xmm0,%%xmm2 \n" "paddd %%xmm3,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pextrw $0x1,%%xmm2,%k0 \n" "pextrw $0x3,%%xmm2,%k1 \n" "cmp $0x0,%4 \n" "jl 99f \n" "sub $0x4,%4 \n" "jl 49f \n" LABELALIGN "40: \n" "movd 0x00(%3,%0,4),%%xmm0 \n" "movd 0x00(%3,%1,4),%%xmm1 \n" "pextrw $0x5,%%xmm2,%k0 \n" "pextrw $0x7,%%xmm2,%k1 \n" "paddd %%xmm3,%%xmm2 \n" "punpckldq %%xmm1,%%xmm0 \n" "movd 0x00(%3,%0,4),%%xmm1 \n" "movd 0x00(%3,%1,4),%%xmm4 \n" "pextrw $0x1,%%xmm2,%k0 \n" "pextrw $0x3,%%xmm2,%k1 \n" "punpckldq %%xmm4,%%xmm1 \n" "punpcklqdq %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%2) \n" "lea 0x10(%2),%2 \n" 
"sub $0x4,%4 \n" "jge 40b \n" "49: \n" "test $0x2,%4 \n" "je 29f \n" "movd 0x00(%3,%0,4),%%xmm0 \n" "movd 0x00(%3,%1,4),%%xmm1 \n" "pextrw $0x5,%%xmm2,%k0 \n" "punpckldq %%xmm1,%%xmm0 \n" "movq %%xmm0,(%2) \n" "lea 0x8(%2),%2 \n" "29: \n" "test $0x1,%4 \n" "je 99f \n" "movd 0x00(%3,%0,4),%%xmm0 \n" "movd %%xmm0,(%2) \n" "99: \n" : "=&a"(x0), // %0 "=&d"(x1), // %1 "+r"(dst_argb), // %2 "+r"(src_argb), // %3 "+r"(dst_width) // %4 : "rm"(x), // %5 "rm"(dx) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { (void)x; (void)dx; asm volatile(LABELALIGN "1: \n" "movdqu (%1),%%xmm0 \n" "lea 0x10(%1),%1 \n" "movdqa %%xmm0,%%xmm1 \n" "punpckldq %%xmm0,%%xmm0 \n" "punpckhdq %%xmm1,%%xmm1 \n" "movdqu %%xmm0,(%0) \n" "movdqu %%xmm1,0x10(%0) \n" "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" "jg 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width) // %2 ::"memory", "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static const uvec8 kShuffleColARGB = { 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each static const uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; // Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { intptr_t x0, x1; asm volatile( "movdqa %0,%%xmm4 \n" "movdqa %1,%%xmm5 \n" : : "m"(kShuffleColARGB), // %0 "m"(kShuffleFractions) // %1 ); asm volatile( "movd %5,%%xmm2 \n" "movd %6,%%xmm3 \n" "pcmpeqb %%xmm6,%%xmm6 \n" "psrlw $0x9,%%xmm6 \n" "pextrw $0x1,%%xmm2,%k3 \n" "sub $0x2,%2 \n" "jl 29f \n" "movdqa %%xmm2,%%xmm0 \n" "paddd %%xmm3,%%xmm0 \n" "punpckldq %%xmm0,%%xmm2 \n" "punpckldq %%xmm3,%%xmm3 \n" "paddd %%xmm3,%%xmm3 \n" "pextrw $0x3,%%xmm2,%k4 \n" LABELALIGN "2: \n" "movdqa %%xmm2,%%xmm1 \n" "paddd %%xmm3,%%xmm2 \n" "movq 0x00(%1,%3,4),%%xmm0 \n" "psrlw $0x9,%%xmm1 \n" "movhps 0x00(%1,%4,4),%%xmm0 \n" "pshufb %%xmm5,%%xmm1 \n" "pshufb %%xmm4,%%xmm0 \n" "pxor %%xmm6,%%xmm1 \n" "pmaddubsw %%xmm1,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "pextrw $0x1,%%xmm2,%k3 \n" "pextrw $0x3,%%xmm2,%k4 \n" "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0,(%0) \n" "lea 0x8(%0),%0 \n" "sub $0x2,%2 \n" "jge 2b \n" LABELALIGN "29: \n" "add $0x1,%2 \n" "jl 99f \n" "psrlw $0x9,%%xmm2 \n" "movq 0x00(%1,%3,4),%%xmm0 \n" "pshufb %%xmm5,%%xmm2 \n" "pshufb %%xmm4,%%xmm0 \n" "pxor %%xmm6,%%xmm2 \n" "pmaddubsw %%xmm2,%%xmm0 \n" "psrlw $0x7,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movd %%xmm0,(%0) \n" LABELALIGN "99: \n" // clang-format error. : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+rm"(dst_width), // %2 "=&r"(x0), // %3 "=&r"(x1) // %4 : "rm"(x), // %5 "rm"(dx) // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } // Divide num by div and return as 16.16 fixed point result. int FixedDiv_X86(int num, int div) { asm volatile( "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "idiv %1 \n" "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); return num; } // Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
int FixedDiv1_X86(int num, int div) { asm volatile( "cdq \n" "shld $0x10,%%eax,%%edx \n" "shl $0x10,%%eax \n" "sub $0x10001,%%eax \n" "sbb $0x0,%%edx \n" "sub $0x1,%1 \n" "idiv %1 \n" "mov %0, %%eax \n" : "+a"(num) // %0 : "c"(div) // %1 : "memory", "cc", "edx"); return num; } #ifdef HAS_SCALEUVROWDOWN2BOX_SSSE3 // Shuffle table for splitting UV into upper and lower part of register. static const uvec8 kShuffleSplitUV = {0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u, 1u, 3u, 5u, 7u, 9u, 11u, 13u, 15u}; static const uvec8 kShuffleMergeUV = {0u, 8u, 2u, 10u, 4u, 12u, 6u, 14u, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "pcmpeqb %%xmm4,%%xmm4 \n" // 01010101 "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5, %%xmm5 \n" // zero "movdqa %4,%%xmm1 \n" // split shuffler "movdqa %5,%%xmm3 \n" // merge shuffler LABELALIGN "1: \n" "movdqu (%0),%%xmm0 \n" // 8 UV row 0 "movdqu 0x00(%0,%3,1),%%xmm2 \n" // 8 UV row 1 "lea 0x10(%0),%0 \n" "pshufb %%xmm1,%%xmm0 \n" // uuuuvvvv "pshufb %%xmm1,%%xmm2 \n" "pmaddubsw %%xmm4,%%xmm0 \n" // horizontal add "pmaddubsw %%xmm4,%%xmm2 \n" "paddw %%xmm2,%%xmm0 \n" // vertical add "psrlw $0x1,%%xmm0 \n" // round "pavgw %%xmm5,%%xmm0 \n" "pshufb %%xmm3,%%xmm0 \n" // merge uv "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" // 4 UV "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "m"(kShuffleSplitUV), // %4 "m"(kShuffleMergeUV) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEUVROWDOWN2BOX_SSSE3 #ifdef HAS_SCALEUVROWDOWN2BOX_AVX2 void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 
zero "vbroadcastf128 %4,%%ymm1 \n" // split shuffler "vbroadcastf128 %5,%%ymm3 \n" // merge shuffler LABELALIGN "1: \n" "vmovdqu (%0),%%ymm0 \n" // 16 UV row 0 "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" // 16 UV row 1 "lea 0x20(%0),%0 \n" "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv "vpshufb %%ymm1,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // horizontal add "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" // vertical add "vpsrlw $0x1,%%ymm0,%%ymm0 \n" // round "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" "vpshufb %%ymm3,%%ymm0,%%ymm0 \n" // merge uv "vpermq $0xd8,%%ymm0,%%ymm0 \n" // combine qwords "vmovdqu %%xmm0,(%1) \n" "lea 0x10(%1),%1 \n" // 8 UV "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "m"(kShuffleSplitUV), // %4 "m"(kShuffleMergeUV) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEUVROWDOWN2BOX_AVX2 static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3, 1, 3}; #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( "pcmpeqw %%xmm4,%%xmm4 \n" "psrlw $15,%%xmm4 \n" "psllw $1,%%xmm4 \n" // all 2 "movdqa %3,%%xmm3 \n" LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 00112233 (1u1v) "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) "movdqa %%xmm0,%%xmm2 \n" "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) "packuswb %%xmm2,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" 
// 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kUVLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "pcmpeqw %%xmm6,%%xmm6 \n" "psrlw $15,%%xmm6 \n" "psllw $3,%%xmm6 \n" // all 8 "movdqa %5,%%xmm7 \n" LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 00112233 (1u1v) "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) "movdqa %%xmm0,%%xmm2 \n" "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) "movq (%0,%3),%%xmm1 \n" "movq 2(%0,%3),%%xmm4 \n" "punpcklbw %%xmm4,%%xmm1 \n" "movdqa %%xmm1,%%xmm3 \n" "punpckhdq %%xmm1,%%xmm3 \n" "punpckldq %%xmm1,%%xmm1 \n" "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) // xmm0 xmm2 // xmm1 xmm3 "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm1,%%xmm5 \n" "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) "movdqa %%xmm1,%%xmm5 \n" "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) "movdqa %%xmm2,%%xmm0 \n" "movdqa %%xmm3,%%xmm1 \n" "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) "paddw %%xmm1,%%xmm0 \n" // 9 
3 3 1 + 8 (1, hi) "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) "movdqa %%xmm3,%%xmm1 \n" "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) "packuswb %%xmm0,%%xmm4 \n" "movdqu %%xmm4,(%1) \n" // store above "packuswb %%xmm1,%%xmm5 \n" "movdqu %%xmm5,(%1,%4) \n" // store below "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $15,%%ymm4,%%ymm4 \n" "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 "vbroadcastf128 %3,%%ymm3 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" "vmovdqu 2(%0),%%xmm1 \n" "vpermq $0b11011000,%%ymm0,%%ymm0 \n" "vpermq $0b11011000,%%ymm1,%%ymm1 \n" "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 uv to 16 uv "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "m"(kUVLinearMadd31) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif #ifdef 
HAS_SCALEUVROWUP2BILINEAR_AVX2 void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" "vpsrlw $15,%%ymm6,%%ymm6 \n" "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 "vbroadcastf128 %5,%%ymm7 \n" LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" "vmovdqu 2(%0),%%xmm1 \n" "vpermq $0b11011000,%%ymm0,%%ymm0 \n" "vpermq $0b11011000,%%ymm1,%%ymm1 \n" "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 "vpermq $0b11011000,%%ymm2,%%ymm2 \n" "vpermq $0b11011000,%%ymm3,%%ymm3 \n" "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) // ymm0 ymm1 // ymm2 ymm3 "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) "vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 
6*near+2*far (2, hi) "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" "vmovdqu %%ymm4,(%1) \n" // store above "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" "vmovdqu %%ymm5,(%1,%4) \n" // store below "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 8 uv to 16 uv "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)), // %4 "m"(kUVLinearMadd31) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2 void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( "pxor %%xmm5,%%xmm5 \n" "pcmpeqd %%xmm4,%%xmm4 \n" "psrld $31,%%xmm4 \n" "pslld $1,%%xmm4 \n" // all 2 LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) "punpcklwd %%xmm5,%%xmm0 \n" // 0011 (32b, 1u1v) "punpcklwd %%xmm5,%%xmm1 \n" // 1122 (32b, 1u1v) "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (lo, far) "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (hi, far) "paddd %%xmm4,%%xmm2 \n" // far+2 (lo) "paddd %%xmm4,%%xmm3 \n" // far+2 (hi) "paddd %%xmm0,%%xmm2 \n" // near+far+2 (lo) "paddd %%xmm1,%%xmm3 \n" // near+far+2 (hi) "paddd %%xmm0,%%xmm0 \n" // 2*near (lo) "paddd %%xmm1,%%xmm1 \n" // 2*near (hi) "paddd %%xmm2,%%xmm0 \n" // 3*near+far+2 (lo) "paddd %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi) "psrld $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) "psrld $2,%%xmm1 \n" // 3/4*near+1/4*far (hi) "packusdw %%xmm1,%%xmm0 \n" "movdqu %%xmm0,(%1) \n" "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 2 uv to 4 uv "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 
"+r"(dst_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2 void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "pxor %%xmm7,%%xmm7 \n" "pcmpeqd %%xmm6,%%xmm6 \n" "psrld $31,%%xmm6 \n" "pslld $3,%%xmm6 \n" // all 8 LABELALIGN "1: \n" "movq (%0),%%xmm0 \n" // 0011 (16b, 1u1v) "movq 4(%0),%%xmm1 \n" // 1122 (16b, 1u1v) "punpcklwd %%xmm7,%%xmm0 \n" // 0011 (near) (32b, 1u1v) "punpcklwd %%xmm7,%%xmm1 \n" // 1122 (near) (32b, 1u1v) "movdqa %%xmm0,%%xmm2 \n" "movdqa %%xmm1,%%xmm3 \n" "pshufd $0b01001110,%%xmm2,%%xmm2 \n" // 1100 (far) (1, lo) "pshufd $0b01001110,%%xmm3,%%xmm3 \n" // 2211 (far) (1, hi) "paddd %%xmm0,%%xmm2 \n" // near+far (1, lo) "paddd %%xmm1,%%xmm3 \n" // near+far (1, hi) "paddd %%xmm0,%%xmm0 \n" // 2*near (1, lo) "paddd %%xmm1,%%xmm1 \n" // 2*near (1, hi) "paddd %%xmm2,%%xmm0 \n" // 3*near+far (1, lo) "paddd %%xmm3,%%xmm1 \n" // 3*near+far (1, hi) "movq (%0,%3,2),%%xmm2 \n" "movq 4(%0,%3,2),%%xmm3 \n" "punpcklwd %%xmm7,%%xmm2 \n" "punpcklwd %%xmm7,%%xmm3 \n" "movdqa %%xmm2,%%xmm4 \n" "movdqa %%xmm3,%%xmm5 \n" "pshufd $0b01001110,%%xmm4,%%xmm4 \n" // 1100 (far) (2, lo) "pshufd $0b01001110,%%xmm5,%%xmm5 \n" // 2211 (far) (2, hi) "paddd %%xmm2,%%xmm4 \n" // near+far (2, lo) "paddd %%xmm3,%%xmm5 \n" // near+far (2, hi) "paddd %%xmm2,%%xmm2 \n" // 2*near (2, lo) "paddd %%xmm3,%%xmm3 \n" // 2*near (2, hi) "paddd %%xmm4,%%xmm2 \n" // 3*near+far (2, lo) "paddd %%xmm5,%%xmm3 \n" // 3*near+far (2, hi) "movdqa %%xmm0,%%xmm4 \n" "movdqa %%xmm2,%%xmm5 \n" "paddd %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) "paddd %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) "paddd %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) "paddd %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) "psrld $4,%%xmm4 \n" // ^ div by 16 (1, lo) "movdqa %%xmm2,%%xmm5 \n" "paddd %%xmm2,%%xmm5 \n" // 6*near+2*far (2, lo) "paddd %%xmm6,%%xmm0 
\n" // 3*near+far+8 (1, lo) "paddd %%xmm2,%%xmm5 \n" // 9*near+3*far (2, lo) "paddd %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) "psrld $4,%%xmm5 \n" // ^ div by 16 (2, lo) "movdqa %%xmm1,%%xmm0 \n" "movdqa %%xmm3,%%xmm2 \n" "paddd %%xmm1,%%xmm0 \n" // 6*near+2*far (1, hi) "paddd %%xmm6,%%xmm2 \n" // 3*near+far+8 (2, hi) "paddd %%xmm1,%%xmm0 \n" // 9*near+3*far (1, hi) "paddd %%xmm2,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) "psrld $4,%%xmm0 \n" // ^ div by 16 (1, hi) "movdqa %%xmm3,%%xmm2 \n" "paddd %%xmm3,%%xmm2 \n" // 6*near+2*far (2, hi) "paddd %%xmm6,%%xmm1 \n" // 3*near+far+8 (1, hi) "paddd %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi) "paddd %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi) "psrld $4,%%xmm2 \n" // ^ div by 16 (2, hi) "packusdw %%xmm0,%%xmm4 \n" "movdqu %%xmm4,(%1) \n" // store above "packusdw %%xmm2,%%xmm5 \n" "movdqu %%xmm5,(%1,%4,2) \n" // store below "lea 0x8(%0),%0 \n" "lea 0x10(%1),%1 \n" // 2 uv to 4 uv "sub $0x4,%2 \n" "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif #ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2 void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { asm volatile( "vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n" "vpsrld $31,%%ymm4,%%ymm4 \n" "vpslld $1,%%ymm4,%%ymm4 \n" // all 2 LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) "vpaddd %%ymm4,%%ymm2,%%ymm2 \n" // far+2 (lo) "vpaddd %%ymm4,%%ymm3,%%ymm3 \n" // far+2 (hi) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far+2 (lo) "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far+2 (hi) "vpaddd 
%%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far+2 (lo) "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far+2 (hi) "vpsrld $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) "vpsrld $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" "vmovdqu %%ymm0,(%1) \n" "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2 void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { asm volatile( "vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n" "vpsrld $31,%%ymm6,%%ymm6 \n" "vpslld $3,%%ymm6,%%ymm6 \n" // all 8 LABELALIGN "1: \n" "vmovdqu (%0),%%xmm0 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0),%%xmm1 \n" // 11223344 (16b, 1u1v) "vpmovzxwd %%xmm0,%%ymm0 \n" // 01234567 (32b, 1u1v) "vpmovzxwd %%xmm1,%%ymm1 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm0,%%ymm2 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm1,%%ymm3 \n" // 22114433 (hi, far) "vpaddd %%ymm0,%%ymm2,%%ymm2 \n" // near+far (lo) "vpaddd %%ymm1,%%ymm3,%%ymm3 \n" // near+far (hi) "vpaddd %%ymm0,%%ymm0,%%ymm0 \n" // 2*near (lo) "vpaddd %%ymm1,%%ymm1,%%ymm1 \n" // 2*near (hi) "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 3*near+far (lo) "vpaddd %%ymm1,%%ymm3,%%ymm1 \n" // 3*near+far (hi) "vmovdqu (%0,%3,2),%%xmm2 \n" // 00112233 (16b, 1u1v) "vmovdqu 4(%0,%3,2),%%xmm3 \n" // 11223344 (16b, 1u1v) "vpmovzxwd %%xmm2,%%ymm2 \n" // 01234567 (32b, 1u1v) "vpmovzxwd %%xmm3,%%ymm3 \n" // 12345678 (32b, 1u1v) "vpshufd $0b01001110,%%ymm2,%%ymm4 \n" // 11003322 (lo, far) "vpshufd $0b01001110,%%ymm3,%%ymm5 \n" // 22114433 (hi, far) "vpaddd %%ymm2,%%ymm4,%%ymm4 \n" // near+far (lo) "vpaddd %%ymm3,%%ymm5,%%ymm5 \n" // near+far (hi) "vpaddd 
%%ymm2,%%ymm2,%%ymm2 \n" // 2*near (lo) "vpaddd %%ymm3,%%ymm3,%%ymm3 \n" // 2*near (hi) "vpaddd %%ymm2,%%ymm4,%%ymm2 \n" // 3*near+far (lo) "vpaddd %%ymm3,%%ymm5,%%ymm3 \n" // 3*near+far (hi) "vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) "vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) "vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) "vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) "vpsrld $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) "vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) "vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) "vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) "vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) "vpsrld $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) "vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) "vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) "vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) "vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) "vpsrld $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) "vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) "vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) "vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) "vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) "vpsrld $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) "vpackusdw %%ymm0,%%ymm4,%%ymm4 \n" "vmovdqu %%ymm4,(%1) \n" // store above "vpackusdw %%ymm2,%%ymm5,%%ymm5 \n" "vmovdqu %%ymm5,(%1,%4,2) \n" // store below "lea 0x10(%0),%0 \n" "lea 0x20(%1),%1 \n" // 4 uv to 8 uv "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"((intptr_t)(src_stride)), // %3 "r"((intptr_t)(dst_stride)) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif 
libyuv-0.0~git20220104.b91df1a/source/scale_mmi.cc000066400000000000000000001600551416500237200212750ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/scale.h" #include #include #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyARGB #include "libyuv/row.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) // clang-format off // CPU agnostic row functions void ScaleRowDown2_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; uint64_t src0, src1, dest; const uint64_t shift = 0x8ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "psrlh %[src0], %[src0], %[shift] \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "psrlh %[src1], %[src1], %[shift] \n\t" "packushb %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift] "f"(shift) : "memory"); } void ScaleRowDown2Linear_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; uint64_t src0, src1; uint64_t dest, dest0, dest1; const uint64_t mask = 
0x00ff00ff00ff00ffULL; const uint64_t shift = 0x8ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "and %[dest0], %[src0], %[mask] \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "and %[dest1], %[src1], %[mask] \n\t" "packushb %[dest0], %[dest0], %[dest1] \n\t" "psrlh %[src0], %[src0], %[shift] \n\t" "psrlh %[src1], %[src1], %[shift] \n\t" "packushb %[dest1], %[src0], %[src1] \n\t" "pavgb %[dest], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [mask] "f"(mask), [shift] "f"(shift), [width] "r"(dst_width) : "memory"); } void ScaleRowDown2Box_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; uint64_t s0, s1, t0, t1; uint64_t dest, dest0, dest1; const uint64_t ph = 0x0002000200020002ULL; const uint64_t mask = 0x00ff00ff00ff00ffULL; const uint64_t shift0 = 0x2ULL; const uint64_t shift1 = 0x8ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[s0], 0x00(%[s]) \n\t" "gsldlc1 %[s0], 0x07(%[s]) \n\t" "psrlh %[s1], %[s0], %[shift1] \n\t" "and %[s0], %[s0], %[mask] \n\t" "gsldrc1 %[t0], 0x00(%[t]) \n\t" "gsldlc1 %[t0], 0x07(%[t]) \n\t" "psrlh %[t1], %[t0], %[shift1] \n\t" "and %[t0], %[t0], %[mask] \n\t" "paddh %[dest0], %[s0], %[s1] \n\t" "paddh %[dest0], %[dest0], %[t0] \n\t" "paddh %[dest0], %[dest0], %[t1] \n\t" "paddh %[dest0], %[dest0], %[ph] \n\t" "psrlh %[dest0], %[dest0], %[shift0] \n\t" "gsldrc1 %[s0], 0x08(%[s]) \n\t" "gsldlc1 %[s0], 0x0f(%[s]) \n\t" "psrlh %[s1], %[s0], %[shift1] \n\t" "and %[s0], %[s0], %[mask] \n\t" 
"gsldrc1 %[t0], 0x08(%[t]) \n\t" "gsldlc1 %[t0], 0x0f(%[t]) \n\t" "psrlh %[t1], %[t0], %[shift1] \n\t" "and %[t0], %[t0], %[mask] \n\t" "paddh %[dest1], %[s0], %[s1] \n\t" "paddh %[dest1], %[dest1], %[t0] \n\t" "paddh %[dest1], %[dest1], %[t1] \n\t" "paddh %[dest1], %[dest1], %[ph] \n\t" "psrlh %[dest1], %[dest1], %[shift0] \n\t" "packushb %[dest], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[s], %[s], 0x10 \n\t" "daddiu %[t], %[t], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), [mask] "f"(mask) : "memory"); } void ScaleARGBRowDown2_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { (void)src_stride; const uint32_t* src = (const uint32_t*)(src_argb); uint32_t* dst = (uint32_t*)(dst_argb); uint64_t src0, src1, dest; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "punpckhwd %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width) : "memory"); } void ScaleARGBRowDown2Linear_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { (void)src_stride; uint64_t src0, src1; uint64_t dest, dest_hi, dest_lo; __asm__ volatile( "1: \n\t" "lwc1 %[src0], 
0x00(%[src_ptr]) \n\t" "lwc1 %[src1], 0x08(%[src_ptr]) \n\t" "punpcklwd %[dest_lo], %[src0], %[src1] \n\t" "lwc1 %[src0], 0x04(%[src_ptr]) \n\t" "lwc1 %[src1], 0x0c(%[src_ptr]) \n\t" "punpcklwd %[dest_hi], %[src0], %[src1] \n\t" "pavgb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) : "memory"); } void ScaleARGBRowDown2Box_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { const uint8_t* s = src_argb; const uint8_t* t = src_argb + src_stride; uint64_t s0, s_hi, s_lo; uint64_t t0, t_hi, t_lo; uint64_t dest, dest_hi, dest_lo; const uint64_t mask = 0x0ULL; const uint64_t ph = 0x0002000200020002ULL; const uint64_t shfit = 0x2ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[s0], 0x00(%[s]) \n\t" "gsldlc1 %[s0], 0x07(%[s]) \n\t" "punpcklbh %[s_lo], %[s0], %[mask] \n\t" "punpckhbh %[s_hi], %[s0], %[mask] \n\t" "paddh %[dest_lo], %[s_lo], %[s_hi] \n\t" "gsldrc1 %[t0], 0x00(%[t]) \n\t" "gsldlc1 %[t0], 0x07(%[t]) \n\t" "punpcklbh %[t_lo], %[t0], %[mask] \n\t" "punpckhbh %[t_hi], %[t0], %[mask] \n\t" "paddh %[dest_lo], %[dest_lo], %[t_lo] \n\t" "paddh %[dest_lo], %[dest_lo], %[t_hi] \n\t" "paddh %[dest_lo], %[dest_lo], %[ph] \n\t" "psrlh %[dest_lo], %[dest_lo], %[shfit] \n\t" "gsldrc1 %[s0], 0x08(%[s]) \n\t" "gsldlc1 %[s0], 0x0f(%[s]) \n\t" "punpcklbh %[s_lo], %[s0], %[mask] \n\t" "punpckhbh %[s_hi], %[s0], %[mask] \n\t" "paddh %[dest_hi], %[s_lo], %[s_hi] \n\t" "gsldrc1 %[t0], 0x08(%[t]) \n\t" "gsldlc1 %[t0], 0x0f(%[t]) \n\t" "punpcklbh %[t_lo], %[t0], %[mask] \n\t" "punpckhbh %[t_hi], %[t0], %[mask] \n\t" "paddh %[dest_hi], %[dest_hi], 
%[t_lo] \n\t" "paddh %[dest_hi], %[dest_hi], %[t_hi] \n\t" "paddh %[dest_hi], %[dest_hi], %[ph] \n\t" "psrlh %[dest_hi], %[dest_hi], %[shfit] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[s], %[s], 0x10 \n\t" "daddiu %[t], %[t], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [s0] "=&f"(s0), [t0] "=&f"(t0), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest] "=&f"(dest) : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), [mask] "f"(mask), [ph] "f"(ph), [shfit] "f"(shfit) : "memory"); } void ScaleRowDown2_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { (void)src_stride; uint64_t src0, src1, dest; const uint64_t shift = 0x10ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "psrlw %[src0], %[src0], %[shift] \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "psrlw %[src1], %[src1], %[shift] \n\t" "packsswh %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift] "f"(shift) : "memory"); } void ScaleRowDown2Linear_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { (void)src_stride; uint64_t src0, src1; uint64_t dest, dest_hi, dest_lo; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" 
"gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "punpcklhw %[dest_lo], %[src0], %[src1] \n\t" "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" "punpcklhw %[src0], %[dest_lo], %[dest_hi] \n\t" "punpckhhw %[src1], %[dest_lo], %[dest_hi] \n\t" "pavgh %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width) : "memory"); } void ScaleRowDown2Box_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { const uint16_t* s = src_ptr; const uint16_t* t = src_ptr + src_stride; uint64_t s0, s1, s_hi, s_lo; uint64_t t0, t1, t_hi, t_lo; uint64_t dest, dest0, dest1; const uint64_t ph = 0x0000000200000002ULL; const uint64_t mask = 0x0000ffff0000ffffULL; const uint64_t shift0 = 0x10ULL; const uint64_t shift1 = 0x2ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[s0], 0x00(%[s]) \n\t" "gsldlc1 %[s0], 0x07(%[s]) \n\t" "psrlw %[s1], %[s0], %[shift0] \n\t" "and %[s0], %[s0], %[mask] \n\t" "gsldrc1 %[t0], 0x00(%[t]) \n\t" "gsldlc1 %[t0], 0x07(%[t]) \n\t" "psrlw %[t1], %[t0], %[shift0] \n\t" "and %[t0], %[t0], %[mask] \n\t" "paddw %[dest0], %[s0], %[s1] \n\t" "paddw %[dest0], %[dest0], %[t0] \n\t" "paddw %[dest0], %[dest0], %[t1] \n\t" "paddw %[dest0], %[dest0], %[ph] \n\t" "psrlw %[dest0], %[dest0], %[shift1] \n\t" "gsldrc1 %[s0], 0x08(%[s]) \n\t" "gsldlc1 %[s0], 0x0f(%[s]) \n\t" "psrlw %[s1], %[s0], %[shift0] \n\t" "and %[s0], %[s0], %[mask] \n\t" "gsldrc1 %[t0], 0x08(%[t]) \n\t" "gsldlc1 %[t0], 0x0f(%[t]) \n\t" "psrlw %[t1], %[t0], %[shift0] \n\t" "and %[t0], %[t0], %[mask] \n\t" "paddw %[dest1], %[s0], %[s1] \n\t" "paddw %[dest1], %[dest1], %[t0] \n\t" "paddw %[dest1], %[dest1], %[t1] \n\t" 
"paddw %[dest1], %[dest1], %[ph] \n\t" "psrlw %[dest1], %[dest1], %[shift1] \n\t" "packsswh %[dest], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[s], %[s], 0x10 \n\t" "daddiu %[t], %[t], 0x10 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [s0] "=&f"(s0), [s1] "=&f"(s1), [t0] "=&f"(t0), [t1] "=&f"(t1), [s_hi] "=&f"(s_hi), [s_lo] "=&f"(s_lo), [t_hi] "=&f"(t_hi), [t_lo] "=&f"(t_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest] "=&f"(dest) : [s] "r"(s), [t] "r"(t), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift0] "f"(shift0), [shift1] "f"(shift1), [ph] "f"(ph), [mask] "f"(mask) : "memory"); } void ScaleRowDown4_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; uint64_t src0, src1; uint64_t dest, dest_hi, dest_lo; const uint64_t shift = 0x10ULL; const uint64_t mask = 0x000000ff000000ffULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "psrlw %[src0], %[src0], %[shift] \n\t" "and %[src0], %[src0], %[mask] \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "psrlw %[src1], %[src1], %[shift] \n\t" "and %[src1], %[src1], %[mask] \n\t" "packsswh %[dest_lo], %[src0], %[src1] \n\t" "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" "psrlw %[src0], %[src0], %[shift] \n\t" "and %[src0], %[src0], %[mask] \n\t" "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" "psrlw %[src1], %[src1], %[shift] \n\t" "and %[src1], %[src1], %[mask] \n\t" "packsswh %[dest_hi], %[src0], %[src1] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 
\n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift] "f"(shift), [mask] "f"(mask) : "memory"); } void ScaleRowDown4_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { (void)src_stride; uint64_t src0, src1; uint64_t dest, dest_hi, dest_lo; const uint64_t mask = 0x0ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "punpckhhw %[dest_lo], %[src0], %[src1] \n\t" "punpcklhw %[dest_lo], %[dest_lo], %[mask] \n\t" "gsldrc1 %[src0], 0x10(%[src_ptr]) \n\t" "gsldlc1 %[src0], 0x17(%[src_ptr]) \n\t" "gsldrc1 %[src1], 0x18(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x1f(%[src_ptr]) \n\t" "punpckhhw %[dest_hi], %[src0], %[src1] \n\t" "punpcklhw %[dest_hi], %[dest_hi], %[mask] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x20 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [mask] "f"(mask) : "memory"); } #define DO_SCALEROWDOWN4BOX_PUNPCKADD() \ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" #define DO_SCALEROWDOWN4BOX_LOOP(reg) \ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ \ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ DO_SCALEROWDOWN4BOX_PUNPCKADD() \ \ 
"ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ DO_SCALEROWDOWN4BOX_PUNPCKADD() \ \ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ DO_SCALEROWDOWN4BOX_PUNPCKADD() \ \ "pmaddhw %[dest_lo], %[dest_lo], %[mask1] \n\t" \ "pmaddhw %[dest_hi], %[dest_hi], %[mask1] \n\t" \ "packsswh " #reg ", %[dest_lo], %[dest_hi] \n\t" \ "pmaddhw " #reg ", " #reg ", %[mask1] \n\t" \ "paddh " #reg ", " #reg ", %[ph] \n\t" \ "psrlh " #reg ", " #reg ", %[shift] \n\t" \ \ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" /* LibYUVScaleTest.ScaleDownBy4_Box */ void ScaleRowDown4Box_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { const uint8_t* src0_ptr = src_ptr; const uint8_t* src1_ptr = src_ptr + src_stride; const uint8_t* src2_ptr = src_ptr + src_stride * 2; const uint8_t* src3_ptr = src_ptr + src_stride * 3; uint64_t src, src_hi, src_lo; uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; const uint64_t mask0 = 0x0ULL; const uint64_t mask1 = 0x0001000100010001ULL; const uint64_t ph = 0x0008000800080008ULL; const uint64_t shift = 0x4ULL; __asm__ volatile( "1: \n\t" DO_SCALEROWDOWN4BOX_LOOP(%[dest0]) DO_SCALEROWDOWN4BOX_LOOP(%[dest1]) DO_SCALEROWDOWN4BOX_LOOP(%[dest2]) DO_SCALEROWDOWN4BOX_LOOP(%[dest3]) "packsswh %[dest_lo], %[dest0], %[dest1] \n\t" "packsswh %[dest_hi], %[dest2], %[dest3] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), [src2_ptr] "r"(src2_ptr), 
[src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), [ph] "f"(ph), [mask1] "f"(mask1) : "memory"); } #define DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ "punpcklbh %[src_lo], %[src], %[mask0] \n\t" \ "punpckhbh %[src_hi], %[src], %[mask0] \n\t" \ "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" \ "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" #define DO_SCALEROWDOWN4BOX_16_LOOP(reg) \ "ldc1 %[src], 0x00(%[src0_ptr]) \n\t" \ "punpcklbh %[dest_lo], %[src], %[mask0] \n\t" \ "punpckhbh %[dest_hi], %[src], %[mask0] \n\t" \ \ "ldc1 %[src], 0x00(%[src1_ptr]) \n\t" \ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ \ "ldc1 %[src], 0x00(%[src2_ptr]) \n\t" \ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ \ "ldc1 %[src], 0x00(%[src3_ptr]) \n\t" \ DO_SCALEROWDOWN4BOX_16_PUNPCKADD() \ \ "paddw %[dest], %[dest_lo], %[dest_hi] \n\t" \ "punpckhwd %[dest_hi], %[dest], %[dest] \n\t" \ "paddw %[dest], %[dest_hi], %[dest] \n\t" \ "paddw %[dest], %[dest], %[ph] \n\t" \ "psraw %[dest], %[dest], %[shift] \n\t" \ "and " #reg ", %[dest], %[mask1] \n\t" \ \ "daddiu %[src0_ptr], %[src0_ptr], 0x08 \n\t" \ "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" \ "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" \ "daddiu %[src3_ptr], %[src3_ptr], 0x08 \n\t" /* LibYUVScaleTest.ScaleDownBy4_Box_16 */ void ScaleRowDown4Box_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { const uint16_t* src0_ptr = src_ptr; const uint16_t* src1_ptr = src_ptr + src_stride; const uint16_t* src2_ptr = src_ptr + src_stride * 2; const uint16_t* src3_ptr = src_ptr + src_stride * 3; uint64_t src, src_hi, src_lo; uint64_t dest, dest_hi, dest_lo, dest0, dest1, dest2, dest3; const uint64_t mask0 = 0x0ULL; const uint64_t mask1 = 0x00000000ffffffffULL; const uint64_t ph = 0x0000000800000008ULL; const uint64_t shift = 0x04ULL; __asm__ volatile( "1: \n\t" DO_SCALEROWDOWN4BOX_16_LOOP(%[dest0]) DO_SCALEROWDOWN4BOX_16_LOOP(%[dest1]) DO_SCALEROWDOWN4BOX_16_LOOP(%[dest2]) 
DO_SCALEROWDOWN4BOX_16_LOOP(%[dest3]) "punpcklwd %[dest_lo], %[dest0], %[dest1] \n\t" "punpcklwd %[dest_hi], %[dest2], %[dest3] \n\t" "packushb %[dest], %[dest_lo], %[dest_hi] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [dest2] "=&f"(dest2), [dest3] "=&f"(dest3), [src] "=&f"(src), [dest] "=&f"(dest) : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), [src2_ptr] "r"(src2_ptr), [src3_ptr] "r"(src3_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [shift] "f"(shift), [mask0] "f"(mask0), [ph] "f"(ph), [mask1] "f"(mask1) : "memory"); } // Scales a single row of pixels up by 2x using point sampling. void ScaleColsUp2_MMI(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) { uint64_t src, dest; (void)x; (void)dx; __asm__ volatile( "1: \n\t" "lwc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[dest], %[src], %[src] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x04 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) : "memory"); } void ScaleColsUp2_16_MMI(uint16_t* dst_ptr, const uint16_t* src_ptr, int dst_width, int x, int dx) { uint64_t src, dest; (void)x; (void)dx; __asm__ volatile( "1: \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "punpcklhw %[dest], %[src], %[src] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "punpckhhw %[dest], %[src], %[src] \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" 
"daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src] "=&f"(src), [dest] "=&f"(dest) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(dst_width) : "memory"); } void ScaleAddRow_MMI(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { uint64_t src, src_hi, src_lo, dest0, dest1; const uint64_t mask = 0x0ULL; __asm__ volatile( "1: \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "punpcklbh %[src_lo], %[src], %[mask] \n\t" "punpckhbh %[src_hi], %[src], %[mask] \n\t" "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "paddush %[dest0], %[dest0], %[src_lo] \n\t" "gsldrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "paddush %[dest1], %[dest1], %[src_hi] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [src] "=&f"(src) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), [mask] "f"(mask) : "memory"); } void ScaleAddRow_16_MMI(const uint16_t* src_ptr, uint32_t* dst_ptr, int src_width) { uint64_t src, src_hi, src_lo, dest0, dest1; const uint64_t mask = 0x0ULL; __asm__ volatile( "1: \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "punpcklhw %[src_lo], %[src], %[mask] \n\t" "punpckhhw %[src_hi], %[src], %[mask] \n\t" "gsldrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" "gsldlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "paddw %[dest0], %[dest0], %[src_lo] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" "gsldrc1 
%[dest1], 0x08(%[dst_ptr]) \n\t" "gsldlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "paddw %[dest1], %[dest1], %[src_hi] \n\t" "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [src] "=&f"(src) : [src_ptr] "r"(src_ptr), [dst_ptr] "r"(dst_ptr), [width] "r"(src_width), [mask] "f"(mask) : "memory"); } void ScaleARGBRowDownEven_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { (void)src_stride; uint64_t src0, src1, dest; __asm__ volatile( "1: \n\t" "lwc1 %[src0], 0x00(%[src_ptr]) \n\t" "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" "lwc1 %[src1], 0x00(%[src_ptr]) \n\t" "punpcklwd %[dest], %[src0], %[src1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "dadd %[src_ptr], %[src_ptr], %[src_stepx_4]\n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [src_stepx_4] "r"(src_stepx << 2), [width] "r"(dst_width) : "memory"); } void ScaleARGBRowDownEvenBox_MMI(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { const uint8_t* src0_ptr = src_argb; const uint8_t* src1_ptr = src_argb + src_stride; uint64_t src0, src1, src_hi, src_lo; uint64_t dest, dest_hi, dest_lo, dest0, dest1; const uint64_t mask = 0x0ULL; const uint64_t ph = 0x0002000200020002ULL; const uint64_t shift = 0x2ULL; __asm__ volatile( "1: \n\t" "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" "lwc1 %[src1], 0x00(%[src1_ptr]) 
\n\t" "punpcklbh %[src_lo], %[src1], %[mask] \n\t" "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" "punpcklbh %[src_hi], %[src1], %[mask] \n\t" "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" "paddh %[dest0], %[dest_hi], %[dest_lo] \n\t" "paddh %[dest0], %[dest0], %[ph] \n\t" "psrlh %[dest0], %[dest0], %[shift] \n\t" "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" "lwc1 %[src0], 0x00(%[src0_ptr]) \n\t" "punpcklbh %[dest_lo], %[src0], %[mask] \n\t" "lwc1 %[src0], 0x04(%[src0_ptr]) \n\t" "punpcklbh %[dest_hi], %[src0], %[mask] \n\t" "lwc1 %[src1], 0x00(%[src1_ptr]) \n\t" "punpcklbh %[src_lo], %[src1], %[mask] \n\t" "lwc1 %[src1], 0x04(%[src1_ptr]) \n\t" "punpcklbh %[src_hi], %[src1], %[mask] \n\t" "paddh %[dest_lo], %[dest_lo], %[src_lo] \n\t" "paddh %[dest_hi], %[dest_hi], %[src_hi] \n\t" "paddh %[dest1], %[dest_hi], %[dest_lo] \n\t" "paddh %[dest1], %[dest1], %[ph] \n\t" "psrlh %[dest1], %[dest1], %[shift] \n\t" "packushb %[dest], %[dest0], %[dest1] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" "dadd %[src0_ptr], %[src0_ptr], %[src_stepx_4] \n\t" "dadd %[src1_ptr], %[src1_ptr], %[src_stepx_4] \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x08 \n\t" "daddi %[width], %[width], -0x02 \n\t" "bnez %[width], 1b \n\t" : [src_hi] "=&f"(src_hi), [src_lo] "=&f"(src_lo), [dest_hi] "=&f"(dest_hi), [dest_lo] "=&f"(dest_lo), [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src0] "=&f"(src0), [src1] "=&f"(src1), [dest] "=&f"(dest) : [src0_ptr] "r"(src0_ptr), [src1_ptr] "r"(src1_ptr), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width), [src_stepx_4] "r"(src_stepx << 2), [shift] "f"(shift), [mask] "f"(mask), [ph] "f"(ph) : "memory"); } // Scales a single row of pixels using point sampling. 
void ScaleARGBCols_MMI(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { const uint32_t* src = (const uint32_t*)(src_argb); uint32_t* dst = (uint32_t*)(dst_argb); const uint32_t* src_tmp; uint64_t dest, offset; const uint64_t shift0 = 16; const uint64_t shift1 = 2; __asm__ volatile( "1: \n\t" "srav %[offset], %[x], %[shift0] \n\t" "sllv %[offset], %[offset], %[shift1] \n\t" "dadd %[src_tmp], %[src_ptr], %[offset] \n\t" "lwc1 %[dest], 0x00(%[src_tmp]) \n\t" "swc1 %[dest], 0x00(%[dst_ptr]) \n\t" "dadd %[x], %[x], %[dx] \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x04 \n\t" "daddi %[width], %[width], -0x01 \n\t" "bnez %[width], 1b \n\t" : [dest] "=&f"(dest), [offset] "=&r"(offset), [src_tmp] "=&r"(src_tmp) : [src_ptr] "r"(src), [dst_ptr] "r"(dst), [width] "r"(dst_width), [dx] "r"(dx), [x] "r"(x), [shift0] "r"(shift0), [shift1] "r"(shift1) : "memory"); } // Scales a single row of pixels up by 2x using point sampling. void ScaleARGBColsUp2_MMI(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { uint64_t src, dest0, dest1; (void)x; (void)dx; __asm__ volatile( "1: \n\t" "gsldrc1 %[src], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src], 0x07(%[src_ptr]) \n\t" "punpcklwd %[dest0], %[src], %[src] \n\t" "gssdlc1 %[dest0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest0], 0x00(%[dst_ptr]) \n\t" "punpckhwd %[dest1], %[src], %[src] \n\t" "gssdlc1 %[dest1], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest1], 0x08(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x04 \n\t" "bnez %[width], 1b \n\t" : [dest0] "=&f"(dest0), [dest1] "=&f"(dest1), [src] "=&f"(src) : [src_ptr] "r"(src_argb), [dst_ptr] "r"(dst_argb), [width] "r"(dst_width) : "memory"); } // Divide num by div and return as 16.16 fixed point result. 
/* LibYUVBaseTest.TestFixedDiv */ int FixedDiv_MIPS(int num, int div) { int quotient = 0; const int shift = 16; asm( "dsll %[num], %[num], %[shift] \n\t" "ddiv %[num], %[div] \t\n" "mflo %[quo] \t\n" : [quo] "+&r"(quotient) : [num] "r"(num), [div] "r"(div), [shift] "r"(shift)); return quotient; } // Divide num by div and return as 16.16 fixed point result. /* LibYUVScaleTest.ARGBScaleTo320x240_Linear */ int FixedDiv1_MIPS(int num, int div) { int quotient = 0; const int shift = 16; const int val1 = 1; const int64_t val11 = 0x00010001ULL; asm( "dsll %[num], %[num], %[shift] \n\t" "dsub %[num], %[num], %[val11] \n\t" "dsub %[div], %[div], %[val1] \n\t" "ddiv %[num], %[div] \t\n" "mflo %[quo] \t\n" : [quo] "+&r"(quotient) : [num] "r"(num), [div] "r"(div), [val1] "r"(val1), [val11] "r"(val11), [shift] "r"(shift)); return quotient; } // Read 8x2 upsample with filtering and write 16x1. // actually reads an extra pixel, so 9x2. void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { const uint16_t* src2_ptr = src_ptr + src_stride; uint64_t src0, src1; uint64_t dest, dest04, dest15, dest26, dest37; uint64_t tmp0, tmp1, tmp2, tmp3; const uint64_t mask0 = 0x0003000900030009ULL; const uint64_t mask1 = 0x0001000300010003ULL; const uint64_t mask2 = 0x0009000300090003ULL; const uint64_t mask3 = 0x0003000100030001ULL; const uint64_t ph = 0x0000000800000008ULL; const uint64_t shift = 4; __asm__ volatile( "1: \n\t" "gsldrc1 %[src0], 0x00(%[src1_ptr]) \n\t" "gsldlc1 %[src0], 0x07(%[src1_ptr]) \n\t" "pmaddhw %[dest04], %[src0], %[mask0] \n\t" "gsldrc1 %[src1], 0x00(%[src2_ptr]) \n\t" "gsldlc1 %[src1], 0x07(%[src2_ptr]) \n\t" "pmaddhw %[dest], %[src1], %[mask1] \n\t" "paddw %[dest04], %[dest04], %[dest] \n\t" "paddw %[dest04], %[dest04], %[ph] \n\t" "psrlw %[dest04], %[dest04], %[shift] \n\t" "pmaddhw %[dest15], %[src0], %[mask2] \n\t" "pmaddhw %[dest], %[src1], %[mask3] \n\t" "paddw %[dest15], %[dest15], %[dest] \n\t" "paddw %[dest15], 
%[dest15], %[ph] \n\t" "psrlw %[dest15], %[dest15], %[shift] \n\t" "gsldrc1 %[src0], 0x02(%[src1_ptr]) \n\t" "gsldlc1 %[src0], 0x09(%[src1_ptr]) \n\t" "pmaddhw %[dest26], %[src0], %[mask0] \n\t" "gsldrc1 %[src1], 0x02(%[src2_ptr]) \n\t" "gsldlc1 %[src1], 0x09(%[src2_ptr]) \n\t" "pmaddhw %[dest], %[src1], %[mask1] \n\t" "paddw %[dest26], %[dest26], %[dest] \n\t" "paddw %[dest26], %[dest26], %[ph] \n\t" "psrlw %[dest26], %[dest26], %[shift] \n\t" "pmaddhw %[dest37], %[src0], %[mask2] \n\t" "pmaddhw %[dest], %[src1], %[mask3] \n\t" "paddw %[dest37], %[dest37], %[dest] \n\t" "paddw %[dest37], %[dest37], %[ph] \n\t" "psrlw %[dest37], %[dest37], %[shift] \n\t" /* tmp0 = ( 00 04 02 06 ) */ "packsswh %[tmp0], %[dest04], %[dest26] \n\t" /* tmp1 = ( 01 05 03 07 ) */ "packsswh %[tmp1], %[dest15], %[dest37] \n\t" /* tmp2 = ( 00 01 04 05 )*/ "punpcklhw %[tmp2], %[tmp0], %[tmp1] \n\t" /* tmp3 = ( 02 03 06 07 )*/ "punpckhhw %[tmp3], %[tmp0], %[tmp1] \n\t" /* ( 00 01 02 03 ) */ "punpcklwd %[dest], %[tmp2], %[tmp3] \n\t" "gssdlc1 %[dest], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x00(%[dst_ptr]) \n\t" /* ( 04 05 06 07 ) */ "punpckhwd %[dest], %[tmp2], %[tmp3] \n\t" "gssdlc1 %[dest], 0x0f(%[dst_ptr]) \n\t" "gssdrc1 %[dest], 0x08(%[dst_ptr]) \n\t" "daddiu %[src1_ptr], %[src1_ptr], 0x08 \n\t" "daddiu %[src2_ptr], %[src2_ptr], 0x08 \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x08 \n\t" "bnez %[width], 1b \n\t" : [src0] "=&f"(src0), [src1] "=&f"(src1), [dest04] "=&f"(dest04), [dest15] "=&f"(dest15), [dest26] "=&f"(dest26), [dest37] "=&f"(dest37), [tmp0] "=&f"(tmp0), [tmp1] "=&f"(tmp1), [tmp2] "=&f"(tmp2), [tmp3] "=&f"(tmp3), [dest] "=&f"(dest) : [src1_ptr] "r"(src_ptr), [src2_ptr] "r"(src2_ptr), [dst_ptr] "r"(dst), [width] "r"(dst_width), [mask0] "f"(mask0), [mask1] "f"(mask1), [mask2] "f"(mask2), [mask3] "f"(mask3), [shift] "f"(shift), [ph] "f"(ph) : "memory"); } void ScaleRowDown34_MMI(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int 
dst_width) { (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); uint64_t src[2]; uint64_t tmp[2]; __asm__ volatile ( "1: \n\t" "gsldlc1 %[src0], 0x07(%[src_ptr]) \n\t" "gsldrc1 %[src0], 0x00(%[src_ptr]) \n\t" "gsldlc1 %[src1], 0x0f(%[src_ptr]) \n\t" "gsldrc1 %[src1], 0x08(%[src_ptr]) \n\t" "and %[tmp1], %[src0], %[mask1] \n\t" "psrlw %[tmp0], %[src0], %[rmov] \n\t" "psllw %[tmp0], %[tmp0], %[lmov1] \n\t" "or %[src0], %[tmp0], %[tmp1] \n\t" "punpckhwd %[tmp0], %[src0], %[src0] \n\t" "psllw %[tmp1], %[tmp0], %[rmov] \n\t" "or %[src0], %[src0], %[tmp1] \n\t" "psrlw %[tmp0], %[tmp0], %[rmov8] \n\t" "pextrh %[tmp0], %[tmp0], %[zero] \n\t" "pinsrh_2 %[src0], %[src0], %[tmp0] \n\t" "pextrh %[tmp0], %[src1], %[zero] \n\t" "pinsrh_3 %[src0], %[src0], %[tmp0] \n\t" "punpckhwd %[tmp0], %[src1], %[src1] \n\t" "pextrh %[tmp1], %[tmp0], %[zero] \n\t" "psrlw %[src1], %[src1], %[rmov] \n\t" "psllw %[tmp1], %[tmp1], %[rmov8] \n\t" "or %[src1], %[src1], %[tmp1] \n\t" "and %[tmp0], %[tmp0], %[mask2] \n\t" "or %[src1], %[src1], %[tmp0] \n\t" "gssdlc1 %[src0], 0x07(%[dst_ptr]) \n\t" "gssdrc1 %[src0], 0x00(%[dst_ptr]) \n\t" "gsswlc1 %[src1], 0x0b(%[dst_ptr]) \n\t" "gsswrc1 %[src1], 0x08(%[dst_ptr]) \n\t" "daddiu %[src_ptr], %[src_ptr], 0x10 \n\t" "daddi %[width], %[width], -0x0c \n\t" "daddiu %[dst_ptr], %[dst_ptr], 0x0c \n\t" "bnez %[width], 1b \n\t" : [src0]"=&f"(src[0]), [src1]"=&f"(src[1]), [tmp0]"=&f"(tmp[0]), [tmp1]"=&f"(tmp[1]) : [src_ptr]"r"(src_ptr), [dst_ptr]"r"(dst), [lmov]"f"(0xc), [rmov]"f"(0x18), [mask1]"f"(0xffff0000ffff), [rmov8]"f"(0x8), [zero]"f"(0x0), [mask2]"f"(0xff000000), [width]"r"(dst_width), [lmov1]"f"(0x10) : "memory" ); } // clang-format on #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_msa.cc000066400000000000000000001035611416500237200212720ustar00rootroot00000000000000/* * Copyright 2016 The LibYuv Project 
Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include "libyuv/scale_row.h" // This module is for GCC MSA #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) #include "libyuv/macros_msa.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif #define LOAD_INDEXED_DATA(srcp, indx0, out0) \ { \ out0[0] = srcp[indx0[0]]; \ out0[1] = srcp[indx0[1]]; \ out0[2] = srcp[indx0[2]]; \ out0[3] = srcp[indx0[3]]; \ } void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { int x; v16u8 src0, src1, dst0; (void)src_stride; for (x = 0; x < dst_width; x += 4) { src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); ST_UB(dst0, dst_argb); src_argb += 32; dst_argb += 16; } } void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { int x; v16u8 src0, src1, vec0, vec1, dst0; (void)src_stride; for (x = 0; x < dst_width; x += 4) { src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); ST_UB(dst0, dst_argb); src_argb += 32; dst_argb += 16; } } void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { int x; const uint8_t* s = src_argb; const uint8_t* t = src_argb + src_stride; v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; v8u16 reg0, reg1, reg2, reg3; v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 
12, 9, 13, 10, 14, 11, 15}; for (x = 0; x < dst_width; x += 4) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); reg0 = __msa_hadd_u_h(vec0, vec0); reg1 = __msa_hadd_u_h(vec1, vec1); reg2 = __msa_hadd_u_h(vec2, vec2); reg3 = __msa_hadd_u_h(vec3, vec3); reg0 += reg2; reg1 += reg3; reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); ST_UB(dst0, dst_argb); s += 32; t += 32; dst_argb += 16; } } void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, int32_t src_stepx, uint8_t* dst_argb, int dst_width) { int x; int32_t stepx = src_stepx * 4; int32_t data0, data1, data2, data3; (void)src_stride; for (x = 0; x < dst_width; x += 4) { data0 = LW(src_argb); data1 = LW(src_argb + stepx); data2 = LW(src_argb + stepx * 2); data3 = LW(src_argb + stepx * 3); SW(data0, dst_argb); SW(data1, dst_argb + 4); SW(data2, dst_argb + 8); SW(data3, dst_argb + 12); src_argb += stepx * 4; dst_argb += 16; } } void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { int x; const uint8_t* nxt_argb = src_argb + src_stride; int32_t stepx = src_stepx * 4; int64_t data0, data1, data2, data3; v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; v16u8 vec0, vec1, vec2, vec3; v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v16u8 dst0; for (x = 0; x < dst_width; x += 4) { data0 = LD(src_argb); data1 = LD(src_argb + stepx); data2 = LD(src_argb + stepx * 2); data3 = LD(src_argb + stepx * 3); src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); src0 = 
(v16u8)__msa_insert_d((v2i64)src0, 1, data1); src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); data0 = LD(nxt_argb); data1 = LD(nxt_argb + stepx); data2 = LD(nxt_argb + stepx * 2); data3 = LD(nxt_argb + stepx * 3); src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); reg0 = __msa_hadd_u_h(vec0, vec0); reg1 = __msa_hadd_u_h(vec1, vec1); reg2 = __msa_hadd_u_h(vec2, vec2); reg3 = __msa_hadd_u_h(vec3, vec3); reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); reg4 += reg6; reg5 += reg7; reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); ST_UB(dst0, dst_argb); src_argb += stepx * 4; nxt_argb += stepx * 4; dst_argb += 16; } } void ScaleRowDown2_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; v16u8 src0, src1, src2, src3, dst0, dst1; (void)src_stride; for (x = 0; x < dst_width; x += 32) { src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); ST_UB2(dst0, dst1, dst, 16); src_ptr += 64; dst += 32; } } void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; 
v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; (void)src_stride; for (x = 0; x < dst_width; x += 32) { src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); dst0 = __msa_aver_u_b(vec1, vec0); dst1 = __msa_aver_u_b(vec3, vec2); ST_UB2(dst0, dst1, dst, 16); src_ptr += 64; dst += 32; } } void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; v8u16 vec0, vec1, vec2, vec3; for (x = 0; x < dst_width; x += 32) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); vec0 = __msa_hadd_u_h(src0, src0); vec1 = __msa_hadd_u_h(src1, src1); vec2 = __msa_hadd_u_h(src2, src2); vec3 = __msa_hadd_u_h(src3, src3); vec0 += __msa_hadd_u_h(src4, src4); vec1 += __msa_hadd_u_h(src5, src5); vec2 += __msa_hadd_u_h(src6, src6); vec3 += __msa_hadd_u_h(src7, src7); vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); ST_UB2(dst0, dst1, dst, 16); s += 64; t += 64; dst += 32; } } void ScaleRowDown4_MSA(const uint8_t* src_ptr, ptrdiff_t 
src_stride, uint8_t* dst, int dst_width) { int x; v16u8 src0, src1, src2, src3, vec0, vec1, dst0; (void)src_stride; for (x = 0; x < dst_width; x += 16) { src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst); src_ptr += 64; dst += 16; } } void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; const uint8_t* s = src_ptr; const uint8_t* t0 = s + src_stride; const uint8_t* t1 = s + src_stride * 2; const uint8_t* t2 = s + src_stride * 3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; v8u16 vec0, vec1, vec2, vec3; v4u32 reg0, reg1, reg2, reg3; for (x = 0; x < dst_width; x += 16) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); vec0 = __msa_hadd_u_h(src0, src0); vec1 = __msa_hadd_u_h(src1, src1); vec2 = __msa_hadd_u_h(src2, src2); vec3 = __msa_hadd_u_h(src3, src3); vec0 += __msa_hadd_u_h(src4, src4); vec1 += __msa_hadd_u_h(src5, src5); vec2 += __msa_hadd_u_h(src6, src6); vec3 += __msa_hadd_u_h(src7, src7); src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); vec0 += __msa_hadd_u_h(src0, src0); vec1 += __msa_hadd_u_h(src1, src1); vec2 += 
__msa_hadd_u_h(src2, src2); vec3 += __msa_hadd_u_h(src3, src3); vec0 += __msa_hadd_u_h(src4, src4); vec1 += __msa_hadd_u_h(src5, src5); vec2 += __msa_hadd_u_h(src6, src6); vec3 += __msa_hadd_u_h(src7, src7); reg0 = __msa_hadd_u_w(vec0, vec0); reg1 = __msa_hadd_u_w(vec1, vec1); reg2 = __msa_hadd_u_w(vec2, vec2); reg3 = __msa_hadd_u_w(vec3, vec3); reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); ST_UB(dst0, dst); s += 64; t0 += 64; t1 += 64; t2 += 64; dst += 16; } } void ScaleRowDown38_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x, width; uint64_t dst0; uint32_t dst1; v16u8 src0, src1, vec0; v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; (void)src_stride; assert(dst_width % 3 == 0); width = dst_width / 3; for (x = 0; x < width; x += 4) { src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); dst0 = __msa_copy_u_d((v2i64)vec0, 0); dst1 = __msa_copy_u_w((v4i32)vec0, 2); SD(dst0, dst); SW(dst1, dst + 8); src_ptr += 32; dst += 12; } } void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { int x, width; const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; uint64_t dst0; uint32_t dst1; v16u8 src0, src1, src2, src3, out; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; v8i16 zero = {0}; v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); assert((dst_width 
% 3 == 0) && (dst_width > 0)); width = dst_width / 3; for (x = 0; x < width; x += 4) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); tmp0 = __msa_hadd_u_w(vec4, vec4); tmp1 = __msa_hadd_u_w(vec5, vec5); tmp2 = __msa_hadd_u_w(vec6, vec6); tmp3 = __msa_hadd_u_w(vec7, vec7); tmp4 = __msa_hadd_u_w(vec0, vec0); vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); tmp0 = __msa_hadd_u_w(vec0, vec0); tmp1 = __msa_hadd_u_w(vec1, vec1); tmp0 *= const_0x2AAA; tmp1 *= const_0x2AAA; tmp4 *= const_0x4000; tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); dst0 = __msa_copy_u_d((v2i64)out, 0); dst1 = __msa_copy_u_w((v4i32)out, 2); SD(dst0, dst_ptr); SW(dst1, dst_ptr + 8); s += 32; t += 32; dst_ptr += 12; } } void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, 
uint8_t* dst_ptr, int dst_width) { int x, width; const uint8_t* s = src_ptr; const uint8_t* t0 = s + src_stride; const uint8_t* t1 = s + src_stride * 2; uint64_t dst0; uint32_t dst1; v16u8 src0, src1, src2, src3, src4, src5, out; v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; v8u16 zero = {0}; v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); assert((dst_width % 3 == 0) && (dst_width > 0)); width = dst_width / 3; for (x = 0; x < width; x += 4) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); vec0 = 
(v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); tmp0 = __msa_hadd_u_w(vec4, vec4); tmp1 = __msa_hadd_u_w(vec5, vec5); tmp2 = __msa_hadd_u_w(vec6, vec6); tmp3 = __msa_hadd_u_w(vec7, vec7); tmp4 = __msa_hadd_u_w(vec0, vec0); vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); tmp0 = __msa_hadd_u_w(vec0, vec0); tmp1 = __msa_hadd_u_w(vec1, vec1); tmp0 *= const_0x1C71; tmp1 *= const_0x1C71; tmp4 *= const_0x2AAA; tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); dst0 = __msa_copy_u_d((v2i64)out, 0); dst1 = __msa_copy_u_w((v4i32)out, 2); SD(dst0, dst_ptr); SW(dst1, dst_ptr + 8); s += 32; t0 += 32; t1 += 32; dst_ptr += 12; } } void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { int x; v16u8 src0; v8u16 dst0, dst1; v16i8 zero = {0}; assert(src_width > 0); for (x = 0; x < src_width; x += 16) { src0 = LD_UB(src_ptr); dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); ST_UH2(dst0, dst1, dst_ptr, 8); src_ptr += 16; dst_ptr += 16; } } void ScaleFilterCols_MSA(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) { int j; v4i32 vec_x = __msa_fill_w(x); v4i32 vec_dx = __msa_fill_w(dx); v4i32 vec_const = {0, 1, 2, 3}; v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8u16 reg0, reg1; v16u8 dst0; v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); v4i32 const_0x40 = __msa_fill_w(0x40); vec0 = vec_dx * vec_const; vec1 = 
vec_dx * 4; vec_x += vec0; for (j = 0; j < dst_width - 1; j += 16) { vec2 = vec_x >> 16; vec6 = vec_x & const_0xFFFF; vec_x += vec1; vec3 = vec_x >> 16; vec7 = vec_x & const_0xFFFF; vec_x += vec1; vec4 = vec_x >> 16; vec8 = vec_x & const_0xFFFF; vec_x += vec1; vec5 = vec_x >> 16; vec9 = vec_x & const_0xFFFF; vec_x += vec1; vec6 >>= 9; vec7 >>= 9; vec8 >>= 9; vec9 >>= 9; LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); vec2 += 1; vec3 += 1; vec4 += 1; vec5 += 1; LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); tmp4 -= tmp0; tmp5 -= tmp1; tmp6 -= tmp2; tmp7 -= tmp3; tmp4 *= vec6; tmp5 *= vec7; tmp6 *= vec8; tmp7 *= vec9; tmp4 += const_0x40; tmp5 += const_0x40; tmp6 += const_0x40; tmp7 += const_0x40; tmp4 >>= 7; tmp5 >>= 7; tmp6 >>= 7; tmp7 >>= 7; tmp0 += tmp4; tmp1 += tmp5; tmp2 += tmp6; tmp3 += tmp7; reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); __msa_st_b(dst0, dst_ptr, 0); dst_ptr += 16; } } void ScaleARGBCols_MSA(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { const uint32_t* src = (const uint32_t*)(src_argb); uint32_t* dst = (uint32_t*)(dst_argb); int j; v4i32 x_vec = __msa_fill_w(x); v4i32 dx_vec = __msa_fill_w(dx); v4i32 const_vec = {0, 1, 2, 3}; v4i32 vec0, vec1, vec2; v4i32 dst0; vec0 = dx_vec * const_vec; vec1 = dx_vec * 4; x_vec += vec0; for (j = 0; j < dst_width; j += 4) { vec2 = x_vec >> 16; x_vec += vec1; LOAD_INDEXED_DATA(src, vec2, dst0); __msa_st_w(dst0, dst, 0); dst += 4; } } void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { const uint32_t* src = (const uint32_t*)(src_argb); int j; v4u32 src0, src1, src2, src3; v4u32 vec0, vec1, 
vec2, vec3; v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; v16u8 mult0, mult1, mult2, mult3; v8u16 tmp0, tmp1, tmp2, tmp3; v16u8 dst0, dst1; v4u32 vec_x = (v4u32)__msa_fill_w(x); v4u32 vec_dx = (v4u32)__msa_fill_w(dx); v4u32 vec_const = {0, 1, 2, 3}; v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); vec0 = vec_dx * vec_const; vec1 = vec_dx * 4; vec_x += vec0; for (j = 0; j < dst_width - 1; j += 8) { vec2 = vec_x >> 16; reg0 = (v16u8)(vec_x >> 9); vec_x += vec1; vec3 = vec_x >> 16; reg1 = (v16u8)(vec_x >> 9); vec_x += vec1; reg0 = reg0 & const_0x7f; reg1 = reg1 & const_0x7f; reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); reg2 = reg0 ^ const_0x7f; reg3 = reg1 ^ const_0x7f; mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); LOAD_INDEXED_DATA(src, vec2, src0); LOAD_INDEXED_DATA(src, vec3, src1); vec2 += 1; vec3 += 1; LOAD_INDEXED_DATA(src, vec2, src2); LOAD_INDEXED_DATA(src, vec3, src3); reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); tmp0 = __msa_dotp_u_h(reg4, mult0); tmp1 = __msa_dotp_u_h(reg5, mult1); tmp2 = __msa_dotp_u_h(reg6, mult2); tmp3 = __msa_dotp_u_h(reg7, mult3); tmp0 >>= 7; tmp1 >>= 7; tmp2 >>= 7; tmp3 >>= 7; dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); __msa_st_b(dst0, dst_argb, 0); __msa_st_b(dst1, dst_argb, 16); dst_argb += 32; } } void ScaleRowDown34_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { int x; (void)src_stride; v16u8 src0, src1, src2, src3; v16u8 vec0, vec1, vec2; v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 
15, 16, 17, 19, 20, 21, 23, 24, 25}; v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, 27, 28, 29, 31}; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 48) { src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); __msa_st_b((v16i8)vec0, dst, 0); __msa_st_b((v16i8)vec1, dst, 16); __msa_st_b((v16i8)vec2, dst, 32); src_ptr += 64; dst += 48; } } void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; v16u8 vec0, vec1, vec2, vec3, vec4, vec5; v16u8 vec6, vec7, vec8, vec9, vec10, vec11; v8i16 reg0, reg1, reg2, reg3, reg4, reg5; v8i16 reg6, reg7, reg8, reg9, reg10, reg11; v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, 16, 17, 17, 18, 18, 19, 20, 21}; v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 48) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); src6 
= (v16u8)__msa_ld_b((v16i8*)t, 32); src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); reg0 = __msa_srar_h(reg0, shft0); reg1 = __msa_srar_h(reg1, shft1); reg2 = __msa_srar_h(reg2, shft2); reg3 = __msa_srar_h(reg3, shft0); reg4 = __msa_srar_h(reg4, shft1); reg5 = __msa_srar_h(reg5, shft2); reg6 = __msa_srar_h(reg6, shft0); reg7 = __msa_srar_h(reg7, shft1); reg8 = __msa_srar_h(reg8, shft2); reg9 = __msa_srar_h(reg9, shft0); reg10 = __msa_srar_h(reg10, shft1); reg11 = __msa_srar_h(reg11, shft2); reg0 = reg0 * 3 + reg6; reg1 = reg1 * 3 + reg7; reg2 = reg2 * 3 + reg8; reg3 = reg3 * 3 + reg9; reg4 = reg4 * 3 + reg10; reg5 = reg5 * 3 + reg11; reg0 = __msa_srari_h(reg0, 2); reg1 = __msa_srari_h(reg1, 2); reg2 = __msa_srari_h(reg2, 2); reg3 
= __msa_srari_h(reg3, 2); reg4 = __msa_srari_h(reg4, 2); reg5 = __msa_srari_h(reg5, 2); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); __msa_st_b((v16i8)dst0, d, 0); __msa_st_b((v16i8)dst1, d, 16); __msa_st_b((v16i8)dst2, d, 32); s += 64; t += 64; d += 48; } } void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* d, int dst_width) { const uint8_t* s = src_ptr; const uint8_t* t = src_ptr + src_stride; int x; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; v16u8 vec0, vec1, vec2, vec3, vec4, vec5; v16u8 vec6, vec7, vec8, vec9, vec10, vec11; v8i16 reg0, reg1, reg2, reg3, reg4, reg5; v8i16 reg6, reg7, reg8, reg9, reg10, reg11; v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, 16, 17, 17, 18, 18, 19, 20, 21}; v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 48) { src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); vec4 
= (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); reg0 = __msa_srar_h(reg0, shft0); reg1 = __msa_srar_h(reg1, shft1); reg2 = __msa_srar_h(reg2, shft2); reg3 = __msa_srar_h(reg3, shft0); reg4 = __msa_srar_h(reg4, shft1); reg5 = __msa_srar_h(reg5, shft2); reg6 = __msa_srar_h(reg6, shft0); reg7 = __msa_srar_h(reg7, shft1); reg8 = __msa_srar_h(reg8, shft2); reg9 = __msa_srar_h(reg9, shft0); reg10 = __msa_srar_h(reg10, shft1); reg11 = __msa_srar_h(reg11, shft2); reg0 += reg6; reg1 += reg7; reg2 += reg8; reg3 += reg9; reg4 += reg10; reg5 += reg11; reg0 = __msa_srari_h(reg0, 1); reg1 = __msa_srari_h(reg1, 1); reg2 = __msa_srari_h(reg2, 1); reg3 = __msa_srari_h(reg3, 1); reg4 = __msa_srari_h(reg4, 1); reg5 = __msa_srari_h(reg5, 1); dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); __msa_st_b((v16i8)dst0, d, 0); __msa_st_b((v16i8)dst1, d, 16); __msa_st_b((v16i8)dst2, d, 32); s += 64; t += 64; d += 48; } } #ifdef 
__cplusplus } // extern "C" } // namespace libyuv #endif #endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) libyuv-0.0~git20220104.b91df1a/source/scale_neon.cc000066400000000000000000001732441416500237200214560ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC Neon. #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) // NEON downscalers with interpolation. // Provided by Fritz Koenig // Read 32x1 throw away even pixels, and write 16x1. void ScaleRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; asm volatile( "1: \n" // load even pixels into q0, odd into q1 "vld2.8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop "vst1.8 {q1}, [%1]! \n" // store odd pixels "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List ); } // Read 32x1 average down and write 16x1. void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; asm volatile( "1: \n" "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels "subs %2, %2, #16 \n" // 16 processed per loop "vrhadd.u8 q0, q0, q1 \n" // rounding half add "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List ); } // Read 32x2 average down and write 16x1. 
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop "vpaddl.u8 q0, q0 \n" // row 1 add adjacent "vpaddl.u8 q1, q1 \n" "vpadal.u8 q0, q2 \n" // row 2 add adjacent + // row1 "vpadal.u8 q1, q3 \n" "vrshrn.u16 d0, q0, #2 \n" // downshift, round and // pack "vrshrn.u16 d1, q1, #2 \n" "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "q0", "q1", "q2", "q3" // Clobber List ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop "vst1.8 {d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "q0", "q1", "memory", "cc"); } void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" "vld1.8 {q0}, [%0]! \n" // load up 16x4 "vld1.8 {q1}, [%3]! \n" "vld1.8 {q2}, [%4]! \n" "vld1.8 {q3}, [%5]! \n" "subs %2, %2, #4 \n" "vpaddl.u8 q0, q0 \n" "vpadal.u8 q0, q1 \n" "vpadal.u8 q0, q2 \n" "vpadal.u8 q0, q3 \n" "vpaddl.u16 q0, q0 \n" "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vmovn.u16 d0, q0 \n" "vst1.32 {d0[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_ptr1), // %3 "+r"(src_ptr2), // %4 "+r"(src_ptr3) // %5 : : "q0", "q1", "q2", "q3", "memory", "cc"); } // Down scale from 4 to 3 pixels. 
Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "d0", "d1", "d2", "d3", "memory", "cc"); } void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together "vmovl.u8 q8, d4 \n" "vmovl.u8 q9, d5 \n" "vmovl.u8 q10, d6 \n" "vmovl.u8 q11, d7 \n" // 3 * line_0 + line_1 "vmlal.u8 q8, d0, d24 \n" "vmlal.u8 q9, d1, d24 \n" "vmlal.u8 q10, d2, d24 \n" "vmlal.u8 q11, d3, d24 \n" // (3 * line_0 + line_1 + 2) >> 2 "vqrshrn.u16 d0, q8, #2 \n" "vqrshrn.u16 d1, q9, #2 \n" "vqrshrn.u16 d2, q10, #2 \n" "vqrshrn.u16 d3, q11, #2 \n" // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "vmovl.u8 q8, d1 \n" "vmlal.u8 q8, d0, d24 \n" "vqrshrn.u16 d0, q8, #2 \n" // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "vmovl.u8 q8, d2 \n" "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" "vst3.8 {d0, d1, d2}, [%1]! 
\n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"); } void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // average src line 0 with src line 1 "vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q1, q1, q3 \n" // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "vmovl.u8 q3, d1 \n" "vmlal.u8 q3, d0, d24 \n" "vqrshrn.u16 d0, q3, #2 \n" // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "vrhadd.u8 d1, d1, d2 \n" // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "vmovl.u8 q3, d2 \n" "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); } #define HAS_SCALEROWDOWN38_NEON static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0}; static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; // 32 -> 12 void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "vld1.8 {q3}, [%3] \n" "1: \n" "vld1.8 {d0, d1, d2, d3}, [%0]! \n" "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" "vst1.8 {d4}, [%1]! \n" "vst1.32 {d5[0]}, [%1]! 
\n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"(&kShuf38) // %3 : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); } // 32x3 -> 12x1 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride * 2; asm volatile( "vld1.16 {q13}, [%5] \n" "vld1.8 {q14}, [%6] \n" "vld1.8 {q15}, [%7] \n" "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" "vld4.8 {d4, d5, d6, d7}, [%3]! \n" "vld4.8 {d16, d17, d18, d19}, [%4]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 "vtrn.u8 d0, d1 \n" "vtrn.u8 d4, d5 \n" "vtrn.u8 d16, d17 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 "vtrn.u8 d2, d3 \n" "vtrn.u8 d6, d7 \n" "vtrn.u8 d18, d19 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 "vpaddl.u8 q0, q0 \n" "vpaddl.u8 q2, q2 \n" "vpaddl.u8 q8, q8 \n" // d3 = 60+70 61+71 62+72 63+73 "vpaddl.u8 d3, d3 \n" "vpaddl.u8 d7, d7 \n" "vpaddl.u8 d19, d19 \n" // combine source lines "vadd.u16 q0, q2 \n" "vadd.u16 q0, q8 \n" "vadd.u16 d4, d3, d7 \n" "vadd.u16 d4, d19 \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + s[7 + st * 2]) / 6 "vqrdmulh.s16 q2, q2, q13 \n" "vmovn.u16 d4, q2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This // requires expanding from u8 to u16 as the 0,1 and 4,5 // registers are already expanded. Then do transposes // to get aligned. 
// q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 "vmovl.u8 q1, d2 \n" "vmovl.u8 q3, d6 \n" "vmovl.u8 q9, d18 \n" // combine source lines "vadd.u16 q1, q3 \n" "vadd.u16 q1, q9 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. "vqrdmulh.s16 q0, q0, q15 \n" // Align for table lookup, vtbl requires registers to // be adjacent "vmov.u8 d2, d4 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" "vst1.8 {d3}, [%1]! \n" "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride), // %3 "+r"(src_ptr1) // %4 : "r"(&kMult38_Div6), // %5 "r"(&kShuf38_2), // %6 "r"(&kMult38_Div9) // %7 : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"); } // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "vld1.16 {q13}, [%4] \n" "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" "1: \n" // d0 = 00 40 01 41 02 42 03 43 // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" "vld4.8 {d4, d5, d6, d7}, [%3]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 // d0 = 00 10 01 11 02 12 03 13 // d1 = 40 50 41 51 42 52 43 53 "vtrn.u8 d0, d1 \n" "vtrn.u8 d4, d5 \n" // d2 = 20 30 21 31 22 32 23 33 // d3 = 60 70 61 71 62 72 63 73 "vtrn.u8 d2, d3 \n" "vtrn.u8 d6, d7 \n" // d0 = 00+10 01+11 02+12 03+13 // d2 = 40+50 41+51 42+52 43+53 "vpaddl.u8 q0, q0 \n" "vpaddl.u8 q2, q2 \n" // d3 = 60+70 61+71 62+72 63+73 "vpaddl.u8 d3, d3 \n" "vpaddl.u8 d7, d7 \n" // combine source lines "vadd.u16 q0, q2 \n" "vadd.u16 d4, d3, d7 \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 "vqrshrn.u16 d4, q2, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This // requires expanding from u8 to u16 as the 0,1 and 4,5 // registers are already expanded. Then do transposes // to get aligned. // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 "vmovl.u8 q1, d2 \n" "vmovl.u8 q3, d6 \n" // combine source lines "vadd.u16 q1, q3 \n" // d4 = xx 20 xx 30 xx 22 xx 32 // d5 = xx 21 xx 31 xx 23 xx 33 "vtrn.u32 d2, d3 \n" // d4 = xx 20 xx 21 xx 22 xx 23 // d5 = xx 30 xx 31 xx 32 xx 33 "vtrn.u16 d2, d3 \n" // 0+1+2, 3+4+5 "vadd.u16 q0, q1 \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. "vqrdmulh.s16 q0, q0, q13 \n" // Align for table lookup, vtbl requires registers to // be adjacent "vmov.u8 d2, d4 \n" "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" "vst1.8 {d3}, [%1]! \n" "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : "r"(&kMult38_Div6), // %4 "r"(&kShuf38_2) // %5 : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); } void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; asm volatile( "vmov.u8 d30, #3 \n" "1: \n" "vld1.8 {d4}, [%0]! \n" // 01234567 "vld1.8 {d5}, [%3]! 
\n" // 12345678 "vmovl.u8 q0, d4 \n" // 01234567 (16b) "vmovl.u8 q1, d5 \n" // 12345678 (16b) "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) "vst2.8 {d0, d1}, [%1]! \n" // store "subs %2, %2, #16 \n" // 8 sample -> 16 sample "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_temp) // %3 : : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List ); } void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; uint8_t* dst_ptr1 = dst_ptr + dst_stride; const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; asm volatile( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" "1: \n" "vld1.8 {d4}, [%0]! \n" // 01234567 "vld1.8 {d5}, [%5]! \n" // 12345678 "vmovl.u8 q0, d4 \n" // 01234567 (16b) "vmovl.u8 q1, d5 \n" // 12345678 (16b) "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) "vld1.8 {d8}, [%1]! \n" "vld1.8 {d9}, [%6]! \n" "vmovl.u8 q2, d8 \n" "vmovl.u8 q3, d9 \n" "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) // e o // q1 q0 // q3 q2 "vmovq q4, q2 \n" "vmovq q5, q3 \n" "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) // e o // q5 q4 // q1 q0 "vrshrn.u16 d2, q1, #4 \n" // 2, even "vrshrn.u16 d3, q0, #4 \n" // 2, odd "vrshrn.u16 d0, q5, #4 \n" // 1, even "vrshrn.u16 d1, q4, #4 \n" // 1, odd "vst2.8 {d0, d1}, [%2]! \n" // store "vst2.8 {d2, d3}, [%3]! 
\n" // store "subs %4, %4, #16 \n" // 8 sample -> 16 sample "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(dst_ptr), // %2 "+r"(dst_ptr1), // %3 "+r"(dst_width), // %4 "+r"(src_temp), // %5 "+r"(src_temp1) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", "q15" // Clobber List ); } void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; asm volatile( "vmov.u16 q15, #3 \n" "1: \n" "vld1.16 {q1}, [%0]! \n" // 01234567 (16b) "vld1.16 {q0}, [%3]! \n" // 12345678 (16b) "vmovq q2, q0 \n" "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) "vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd) "vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even) "vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store "subs %2, %2, #16 \n" // 8 sample -> 16 sample "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_temp) // %3 : : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List ); } void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint16_t* src_ptr1 = src_ptr + src_stride; uint16_t* dst_ptr1 = dst_ptr + dst_stride; const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; asm volatile( "vmov.u16 q15, #3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%5]! \n" // 12345678 (16b) "vmovq q2, q0 \n" "vmla.u16 q0, q1, q15 \n" // 3*near+far (odd) "vmla.u16 q1, q2, q15 \n" // 3*near+far (even) "vld1.16 {q2}, [%1]! \n" // 01234567 (16b) "vld1.16 {q3}, [%6]! 
\n" // 12345678 (16b) "vmovq q4, q2 \n" "vmla.u16 q2, q3, q15 \n" // 3*near+far (odd) "vmla.u16 q3, q4, q15 \n" // 3*near+far (even) "vmovq q4, q2 \n" "vmovq q5, q3 \n" "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) "vrshr.u16 q2, q1, #4 \n" // 2, even "vrshr.u16 q3, q0, #4 \n" // 2, odd "vrshr.u16 q0, q5, #4 \n" // 1, even "vrshr.u16 q1, q4, #4 \n" // 1, odd "vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store "vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store "subs %4, %4, #16 \n" // 8 sample -> 16 sample "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(dst_ptr), // %2 "+r"(dst_ptr1), // %3 "+r"(dst_width), // %4 "+r"(src_temp), // %5 "+r"(src_temp1) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q15" // Clobber List ); } void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; asm volatile( "vmov.u16 d31, #3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // 01234567 (16b) "vld1.16 {q1}, [%3]! \n" // 12345678 (16b) "vmovl.u16 q2, d0 \n" // 0123 (32b) "vmovl.u16 q3, d1 \n" // 4567 (32b) "vmovl.u16 q4, d2 \n" // 1234 (32b) "vmovl.u16 q5, d3 \n" // 5678 (32b) "vmlal.u16 q2, d2, d31 \n" "vmlal.u16 q3, d3, d31 \n" "vmlal.u16 q4, d0, d31 \n" "vmlal.u16 q5, d1, d31 \n" "vrshrn.u32 d0, q4, #2 \n" "vrshrn.u32 d1, q5, #2 \n" "vrshrn.u32 d2, q2, #2 \n" "vrshrn.u32 d3, q3, #2 \n" "vst2.16 {q0, q1}, [%1]! 
\n" // store "subs %2, %2, #16 \n" // 8 sample -> 16 sample "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_temp) // %3 : : "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List ); } void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint16_t* src_ptr1 = src_ptr + src_stride; uint16_t* dst_ptr1 = dst_ptr + dst_stride; const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; asm volatile( "vmov.u16 d31, #3 \n" "vmov.u32 q14, #3 \n" "1: \n" "vld1.16 {d0}, [%0]! \n" // 0123 (16b) "vld1.16 {d1}, [%5]! \n" // 1234 (16b) "vmovl.u16 q2, d0 \n" // 0123 (32b) "vmovl.u16 q3, d1 \n" // 1234 (32b) "vmlal.u16 q2, d1, d31 \n" "vmlal.u16 q3, d0, d31 \n" "vld1.16 {d0}, [%1]! \n" // 0123 (16b) "vld1.16 {d1}, [%6]! \n" // 1234 (16b) "vmovl.u16 q4, d0 \n" // 0123 (32b) "vmovl.u16 q5, d1 \n" // 1234 (32b) "vmlal.u16 q4, d1, d31 \n" "vmlal.u16 q5, d0, d31 \n" "vmovq q0, q4 \n" "vmovq q1, q5 \n" "vmla.u32 q4, q2, q14 \n" "vmla.u32 q5, q3, q14 \n" "vmla.u32 q2, q0, q14 \n" "vmla.u32 q3, q1, q14 \n" "vrshrn.u32 d1, q4, #4 \n" "vrshrn.u32 d0, q5, #4 \n" "vrshrn.u32 d3, q2, #4 \n" "vrshrn.u32 d2, q3, #4 \n" "vst2.16 {d0, d1}, [%2]! \n" // store "vst2.16 {d2, d3}, [%3]! \n" // store "subs %4, %4, #8 \n" // 4 sample -> 8 sample "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(dst_ptr), // %2 "+r"(dst_ptr1), // %3 "+r"(dst_width), // %4 "+r"(src_temp), // %5 "+r"(src_temp1) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", "d31" // Clobber List ); } void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; asm volatile( "vmov.u8 d30, #3 \n" "1: \n" "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) "vld1.8 {d5}, [%3]! 
\n" // 11223344 (1u1v) "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) "vst2.16 {d0, d1}, [%1]! \n" // store "subs %2, %2, #8 \n" // 4 uv -> 8 uv "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_temp) // %3 : : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List ); } void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; uint8_t* dst_ptr1 = dst_ptr + dst_stride; const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; asm volatile( "vmov.u16 q15, #3 \n" "vmov.u8 d28, #3 \n" "1: \n" "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v) "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) // e o // q1 q0 // q3 q2 "vmovq q4, q2 \n" "vmovq q5, q3 \n" "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) // e o // q5 q4 // q1 q0 "vrshrn.u16 d2, q1, #4 \n" // 2, even "vrshrn.u16 d3, q0, #4 \n" // 2, odd "vrshrn.u16 d0, q5, #4 \n" // 1, even "vrshrn.u16 d1, q4, #4 \n" // 1, odd "vst2.16 {d0, d1}, [%2]! \n" // store "vst2.16 {d2, d3}, [%3]! 
\n" // store "subs %4, %4, #8 \n" // 4 uv -> 8 uv "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(dst_ptr), // %2 "+r"(dst_ptr1), // %3 "+r"(dst_width), // %4 "+r"(src_temp), // %5 "+r"(src_temp1) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", "q15" // Clobber List ); } void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 2; asm volatile( "vmov.u16 d30, #3 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16) "vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16) "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) "vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b) "vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b) "vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b) "vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd) "vmlal.u16 q3, d0, d30 \n" // 3*near+far (even) "vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd) "vmlal.u16 q5, d1, d30 \n" // 3*near+far (even) "vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd) "vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even) "vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd) "vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even) "vst2.32 {d0, d1}, [%1]! \n" // store "vst2.32 {d2, d3}, [%1]! \n" // store "subs %2, %2, #8 \n" // 4 uv -> 8 uv "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_temp) // %3 : : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d30" // Clobber List ); } void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint16_t* src_ptr1 = src_ptr + src_stride; uint16_t* dst_ptr1 = dst_ptr + dst_stride; const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; asm volatile( "vmov.u16 d30, #3 \n" "vmov.u32 q14, #3 \n" "1: \n" "vld1.8 {d0}, [%0]! \n" // 0011 (1u1v) "vld1.8 {d1}, [%5]! 
\n" // 1122 (1u1v) "vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b) "vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b) "vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd) "vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even) "vld1.8 {d0}, [%1]! \n" // 0011 (1u1v) "vld1.8 {d1}, [%6]! \n" // 1122 (1u1v) "vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b) "vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b) "vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd) "vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even) "vmovq q0, q4 \n" "vmovq q1, q5 \n" "vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd) "vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even) "vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd) "vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even) "vrshrn.u32 d1, q4, #4 \n" // 1, odd "vrshrn.u32 d0, q5, #4 \n" // 1, even "vrshrn.u32 d3, q2, #4 \n" // 2, odd "vrshrn.u32 d2, q3, #4 \n" // 2, even "vst2.32 {d0, d1}, [%2]! \n" // store "vst2.32 {d2, d3}, [%3]! \n" // store "subs %4, %4, #4 \n" // 2 uv -> 4 uv "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(dst_ptr), // %2 "+r"(dst_ptr1), // %3 "+r"(dst_width), // %4 "+r"(src_temp), // %5 "+r"(src_temp1) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14", "d30" // Clobber List ); } // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { asm volatile( "1: \n" "vld1.16 {q1, q2}, [%1] \n" // load accumulator "vld1.8 {q0}, [%0]! \n" // load 16 bytes "vaddw.u8 q2, q2, d1 \n" // add "vaddw.u8 q1, q1, d0 \n" "vst1.16 {q1, q2}, [%1]! 
\n" // store accumulator "subs %2, %2, #16 \n" // 16 processed per loop "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 : : "memory", "cc", "q0", "q1", "q2" // Clobber List ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD2_DATA8_LANE(n) \ "lsr %5, %3, #16 \n" \ "add %6, %1, %5 \n" \ "add %3, %3, %4 \n" \ "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" // The NEON version mimics this formula (from row_common.cc): // #define BLENDER(a, b, f) (uint8_t)((int)(a) + // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) void ScaleFilterCols_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8_t* src_tmp = src_ptr; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx "vld1.32 {q2}, [%5] \n" // 0 1 2 3 "vshl.i32 q3, q1, #2 \n" // 4 * dx "vmul.s32 q1, q1, q2 \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx "vadd.s32 q1, q1, q0 \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx "vadd.s32 q2, q1, q3 \n" "vshl.i32 q0, q3, #1 \n" // 8 * dx "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) LOAD2_DATA8_LANE(3) LOAD2_DATA8_LANE(4) LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) "vmov q10, q1 \n" "vmov q11, q2 \n" "vuzp.16 q10, q11 \n" "vmovl.u8 q8, d6 \n" "vmovl.u8 q9, d7 \n" "vsubl.s16 q11, d18, d16 \n" "vsubl.s16 q12, d19, d17 \n" "vmovl.u16 q13, d20 \n" "vmovl.u16 q10, d21 \n" "vmul.s32 q11, q11, q13 \n" "vmul.s32 q12, q12, q10 \n" "vrshrn.s32 d18, q11, #16 \n" "vrshrn.s32 d19, q12, #16 \n" "vadd.s16 q8, q8, q9 \n" "vmovn.s16 d6, q8 \n" "vst1.8 {d6}, [%0]! 
\n" // store pixels "vadd.s32 q1, q1, q0 \n" "vadd.s32 q2, q2, q0 \n" "subs %2, %2, #8 \n" // 8 processed per loop "bgt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(x), // %3 "+r"(dx), // %4 "+r"(tmp), // %5 "+r"(src_tmp) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13" ); } #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { asm volatile( "cmp %4, #0 \n" "beq 100f \n" "add %2, %1 \n" "cmp %4, #64 \n" "beq 75f \n" "cmp %4, #128 \n" "beq 50f \n" "cmp %4, #192 \n" "beq 25f \n" "vdup.8 d5, %4 \n" "rsb %4, #256 \n" "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" "vmlal.u8 q13, d2, d5 \n" "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" "vld1.8 {q0}, [%1]! \n" "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" "vld1.8 {q1}, [%1]! \n" "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" "vst1.8 {q0}, [%0]! 
\n" "bgt 100b \n" "99: \n" "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 "+r"(dst_width), // %3 "+r"(source_y_fraction) // %4 : : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); } void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; asm volatile( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "subs %2, %2, #8 \n" // 8 processed per loop "vmov q2, q1 \n" // load next 8 ARGB "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List ); } // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! // 4a: 3e04 subs r6, #4 // 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! // 50: ef64 21f4 vorr q9, q10, q10 // 54: f942 038d vst2.32 {d16-d19}, [r2]! // 58: d1f5 bne.n 46 void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { (void)src_stride; asm volatile( "1: \n" "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "subs %2, %2, #8 \n" // 8 processed per loop "vrhadd.u8 q0, q0, q1 \n" // rounding half add "vrhadd.u8 q1, q2, q3 \n" // rounding half add "vst2.32 {q0, q1}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB "subs %3, %3, #8 \n" // 8 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes "vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d3, q3, #2 \n" "vst4.8 {d0, d1, d2, d3}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { (void)src_stride; asm volatile( "mov r12, %3, lsl #2 \n" "1: \n" "vld1.32 {d0[0]}, [%0], r12 \n" "vld1.32 {d0[1]}, [%0], r12 \n" "vld1.32 {d1[0]}, [%0], r12 \n" "vld1.32 {d1[1]}, [%0], r12 \n" "subs %2, %2, #4 \n" // 4 pixels per loop. "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 : "r"(src_stepx) // %3 : "memory", "cc", "r12", "q0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. 
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { asm volatile( "mov r12, %4, lsl #2 \n" "add %1, %1, %0 \n" "1: \n" "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 "vld1.8 {d1}, [%1], r12 \n" "vld1.8 {d2}, [%0], r12 \n" "vld1.8 {d3}, [%1], r12 \n" "vld1.8 {d4}, [%0], r12 \n" "vld1.8 {d5}, [%1], r12 \n" "vld1.8 {d6}, [%0], r12 \n" "vld1.8 {d7}, [%1], r12 \n" "vaddl.u8 q0, d0, d1 \n" "vaddl.u8 q1, d2, d3 \n" "vaddl.u8 q2, d4, d5 \n" "vaddl.u8 q3, d6, d7 \n" "vswp.8 d1, d2 \n" // ab_cd -> ac_bd "vswp.8 d5, d6 \n" // ef_gh -> eg_fh "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. "subs %3, %3, #4 \n" // 4 pixels per loop. "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 "+r"(dst_width) // %3 : "r"(src_stepx) // %4 : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD1_DATA32_LANE(dn, n) \ "lsr %5, %3, #16 \n" \ "add %6, %1, %5, lsl #2 \n" \ "add %3, %3, %4 \n" \ "vld1.32 {" #dn "[" #n "]}, [%6] \n" void ScaleARGBCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { int tmp; const uint8_t* src_tmp = src_argb; asm volatile( "1: \n" // clang-format off LOAD1_DATA32_LANE(d0, 0) LOAD1_DATA32_LANE(d0, 1) LOAD1_DATA32_LANE(d1, 0) LOAD1_DATA32_LANE(d1, 1) LOAD1_DATA32_LANE(d2, 0) LOAD1_DATA32_LANE(d2, 1) LOAD1_DATA32_LANE(d3, 0) LOAD1_DATA32_LANE(d3, 1) // clang-format on "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels "subs %2, %2, #8 \n" // 8 processed per loop "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 "+r"(x), // %3 "+r"(dx), // %4 "=&r"(tmp), // %5 "+r"(src_tmp) // %6 : : "memory", "cc", "q0", "q1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD2_DATA32_LANE(dn1, dn2, n) \ "lsr %5, %3, #16 \n" \ "add %6, %1, %5, lsl #2 \n" \ "add %3, %3, %4 \n" \ "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8_t* src_tmp = src_argb; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx "vld1.32 {q2}, [%5] \n" // 0 1 2 3 "vshl.i32 q9, q1, #2 \n" // 4 * dx "vmul.s32 q1, q1, q2 \n" "vmov.i8 q3, #0x7f \n" // 0x7F "vmov.i16 q15, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx "vadd.s32 q8, q1, q0 \n" "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(d0, d2, 0) LOAD2_DATA32_LANE(d0, d2, 1) LOAD2_DATA32_LANE(d1, d3, 0) LOAD2_DATA32_LANE(d1, d3, 1) "vshrn.i32 d22, q8, #9 \n" "vand.16 d22, d22, d30 \n" "vdup.8 d24, d22[0] \n" "vdup.8 d25, d22[2] \n" "vdup.8 d26, d22[4] \n" "vdup.8 d27, d22[6] \n" "vext.8 d4, d24, d25, #4 \n" "vext.8 d5, d26, d27, #4 \n" // f "veor.8 q10, q2, q3 \n" // 0x7f ^ f "vmull.u8 q11, d0, d20 \n" "vmull.u8 q12, d1, d21 \n" "vmull.u8 q13, d2, d4 \n" "vmull.u8 q14, d3, d5 \n" "vadd.i16 q11, q11, q13 \n" "vadd.i16 q12, q12, q14 \n" "vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d1, q12, #7 \n" "vst1.32 {d0, d1}, [%0]! 
\n" // store pixels "vadd.s32 q8, q8, q9 \n" "subs %2, %2, #4 \n" // 4 processed per loop "bgt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 "+r"(x), // %3 "+r"(dx), // %4 "+r"(tmp), // %5 "+r"(src_tmp) // %6 : : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } #undef LOAD2_DATA32_LANE void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" "vld2.8 {d0, d2}, [%0]! \n" // load 8 UV pixels. "vld2.8 {d1, d3}, [%0]! \n" // load next 8 UV "subs %3, %3, #8 \n" // 8 processed per loop. "vpaddl.u8 q0, q0 \n" // U 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // V 16 bytes -> 8 shorts. "vld2.8 {d16, d18}, [%1]! \n" // load 8 more UV "vld2.8 {d17, d19}, [%1]! \n" // load last 8 UV "vpadal.u8 q0, q8 \n" // U 16 bytes -> 8 shorts. "vpadal.u8 q1, q9 \n" // V 16 bytes -> 8 shorts. "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes "vrshrn.u16 d1, q1, #2 \n" "vst2.8 {d0, d1}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "q0", "q1", "q8", "q9"); } // Reads 4 pixels at a time. void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, // pixel step uint8_t* dst_ptr, int dst_width) { const uint8_t* src1_ptr = src_ptr + src_stepx * 2; const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; asm volatile( "1: \n" "vld1.16 {d0[0]}, [%0], %6 \n" "vld1.16 {d0[1]}, [%1], %6 \n" "vld1.16 {d0[2]}, [%2], %6 \n" "vld1.16 {d0[3]}, [%3], %6 \n" "subs %5, %5, #4 \n" // 4 pixels per loop. "vst1.8 {d0}, [%4]! 
\n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src1_ptr), // %1 "+r"(src2_ptr), // %2 "+r"(src3_ptr), // %3 "+r"(dst_ptr), // %4 "+r"(dst_width) // %5 : "r"(src_stepx * 8) // %6 : "memory", "cc", "d0"); } #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_neon64.cc000066400000000000000000002153121416500237200216210ustar00rootroot00000000000000/* * Copyright 2014 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/row.h" #include "libyuv/scale.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 32x1 throw away even pixels, and write 16x1. void ScaleRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; asm volatile( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st1 {v1.16b}, [%1], #16 \n" // store odd pixels "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List ); } // Read 32x1 average down and write 16x1. 
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; asm volatile( "1: \n" // load even pixels into v0, odd into v1 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" "subs %w2, %w2, #16 \n" // 16 processed per loop "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st1 {v0.16b}, [%1], #16 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List ); } // Read 32x2 average down and write 16x1. void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc "subs %w3, %w3, #16 \n" // 16 processed per loop "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uaddlp v1.8h, v1.16b \n" "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent "uadalp v1.8h, v3.16b \n" "rshrn v0.8b, v0.8h, #2 \n" // round and pack "rshrn2 v0.16b, v1.8h, #2 \n" "st1 {v0.16b}, [%2], #16 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #8 \n" // 8 processed per loop "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st1 {v2.8b}, [%1], #8 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "v0", "v1", "v2", "v3", "memory", "cc"); } void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { const 
uint8_t* src_ptr1 = src_ptr + src_stride; const uint8_t* src_ptr2 = src_ptr + src_stride * 2; const uint8_t* src_ptr3 = src_ptr + src_stride * 3; asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 "ld1 {v1.16b}, [%2], #16 \n" "ld1 {v2.16b}, [%3], #16 \n" "ld1 {v3.16b}, [%4], #16 \n" "subs %w5, %w5, #4 \n" "uaddlp v0.8h, v0.16b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uadalp v0.8h, v1.16b \n" "prfm pldl1keep, [%2, 448] \n" "uadalp v0.8h, v2.16b \n" "prfm pldl1keep, [%3, 448] \n" "uadalp v0.8h, v3.16b \n" "prfm pldl1keep, [%4, 448] \n" "addp v0.8h, v0.8h, v0.8h \n" "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding "st1 {v0.s}[0], [%1], #4 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_ptr1), // %2 "+r"(src_ptr2), // %3 "+r"(src_ptr3), // %4 "+r"(dst_width) // %5 : : "v0", "v1", "v2", "v3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. 
void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "subs %w2, %w2, #24 \n" "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : : "v0", "v1", "v2", "v3", "memory", "cc"); } void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "subs %w2, %w2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room // when adding lines together "ushll v16.8h, v4.8b, #0 \n" "ushll v17.8h, v5.8b, #0 \n" "ushll v18.8h, v6.8b, #0 \n" "ushll v19.8h, v7.8b, #0 \n" // 3 * line_0 + line_1 "umlal v16.8h, v0.8b, v20.8b \n" "umlal v17.8h, v1.8b, v20.8b \n" "umlal v18.8h, v2.8b, v20.8b \n" "umlal v19.8h, v3.8b, v20.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // (3 * line_0 + line_1 + 2) >> 2 "uqrshrn v0.8b, v16.8h, #2 \n" "uqrshrn v1.8b, v17.8h, #2 \n" "uqrshrn v2.8b, v18.8h, #2 \n" "uqrshrn v3.8b, v19.8h, #2 \n" "prfm pldl1keep, [%3, 448] \n" // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "ushll v16.8h, v1.8b, #0 \n" "umlal v16.8h, v0.8b, v20.8b \n" "uqrshrn v0.8b, v16.8h, #2 \n" // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "ushll v16.8h, v2.8b, #0 \n" "umlal v16.8h, v3.8b, v20.8b \n" "uqrshrn v2.8b, v16.8h, #2 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", 
"v18", "v19", "v20", "memory", "cc"); } void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { asm volatile( "movi v20.8b, #3 \n" "add %3, %3, %0 \n" "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 "subs %w2, %w2, #24 \n" // average src line 0 with src line 1 "urhadd v0.8b, v0.8b, v4.8b \n" "urhadd v1.8b, v1.8b, v5.8b \n" "urhadd v2.8b, v2.8b, v6.8b \n" "urhadd v3.8b, v3.8b, v7.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2 "ushll v4.8h, v1.8b, #0 \n" "umlal v4.8h, v0.8b, v20.8b \n" "uqrshrn v0.8b, v4.8h, #2 \n" "prfm pldl1keep, [%3, 448] \n" // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1 "urhadd v1.8b, v1.8b, v2.8b \n" // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2 "ushll v4.8h, v2.8b, #0 \n" "umlal v4.8h, v3.8b, v20.8b \n" "uqrshrn v2.8b, v4.8h, #2 \n" "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width), // %2 "+r"(src_stride) // %3 : : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); } static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0}; static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12}; static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18}; // 32 -> 12 void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { (void)src_stride; asm volatile( "ld1 {v3.16b}, [%3] \n" "1: \n" "ld1 {v0.16b,v1.16b}, [%0], #32 \n" "subs %w2, %w2, #12 \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st1 {v2.8b}, [%1], #8 \n" "st1 {v2.s}[2], [%1], 
#4 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 : "r"(&kShuf38) // %3 : "v0", "v1", "v2", "v3", "memory", "cc"); } // 32x3 -> 12x1 void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; asm volatile( "ld1 {v29.8h}, [%5] \n" "ld1 {v30.16b}, [%6] \n" "ld1 {v31.8h}, [%7] \n" "add %2, %2, %0 \n" "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" "subs %w4, %w4, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 "trn1 v20.8b, v0.8b, v1.8b \n" "trn2 v21.8b, v0.8b, v1.8b \n" "trn1 v22.8b, v4.8b, v5.8b \n" "trn2 v23.8b, v4.8b, v5.8b \n" "trn1 v24.8b, v16.8b, v17.8b \n" "trn2 v25.8b, v16.8b, v17.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 "trn1 v0.8b, v2.8b, v3.8b \n" "trn2 v1.8b, v2.8b, v3.8b \n" "trn1 v4.8b, v6.8b, v7.8b \n" "trn2 v5.8b, v6.8b, v7.8b \n" "trn1 v16.8b, v18.8b, v19.8b \n" "trn2 v17.8b, v18.8b, v19.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 "uaddlp v20.4h, v20.8b \n" "uaddlp v21.4h, v21.8b \n" "uaddlp v22.4h, v22.8b \n" "uaddlp v23.4h, v23.8b \n" "uaddlp v24.4h, v24.8b \n" "uaddlp v25.4h, v25.8b \n" // 60+70 61+71 62+72 63+73 "uaddlp v1.4h, v1.8b \n" "uaddlp v5.4h, v5.8b \n" "uaddlp v17.4h, v17.8b \n" // combine source lines "add v20.4h, v20.4h, v22.4h \n" "add v21.4h, v21.4h, v23.4h \n" "add v20.4h, v20.4h, v24.4h \n" "add v21.4h, v21.4h, v25.4h \n" "add v2.4h, v1.4h, v5.4h \n" "add v2.4h, v2.4h, v17.4h \n" // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] // + s[6 + st * 1] + s[7 + st * 1] // + s[6 + st * 2] + 
s[7 + st * 2]) / 6 "sqrdmulh v2.8h, v2.8h, v29.8h \n" "xtn v2.8b, v2.8h \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This // requires expanding from u8 to u16 as the 0,1 and 4,5 // registers are already expanded. Then do transposes // to get aligned. // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 "ushll v16.8h, v16.8b, #0 \n" "uaddl v0.8h, v0.8b, v4.8b \n" // combine source lines "add v0.8h, v0.8h, v16.8h \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 "trn1 v1.8h, v0.8h, v0.8h \n" "trn2 v4.8h, v0.8h, v0.8h \n" "xtn v0.4h, v1.4s \n" "xtn v4.4h, v4.4s \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 "add v20.8h, v20.8h, v0.8h \n" "add v21.8h, v21.8h, v4.8h \n" "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. "sqrdmulh v0.8h, v20.8h, v31.8h \n" "sqrdmulh v1.8h, v21.8h, v31.8h \n" "prfm pldl1keep, [%3, 448] \n" // Align for table lookup, vtbl requires registers to be adjacent "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" "st1 {v3.8b}, [%1], #8 \n" "st1 {v3.s}[2], [%1], #4 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 "+r"(src_ptr1), // %3 "+r"(dst_width) // %4 : "r"(&kMult38_Div6), // %5 "r"(&kShuf38_2), // %6 "r"(&kMult38_Div9) // %7 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", "memory", "cc"); } // 32x2 -> 12x1 void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. 
ptrdiff_t tmp_src_stride = src_stride; asm volatile( "ld1 {v30.8h}, [%4] \n" "ld1 {v31.16b}, [%5] \n" "add %2, %2, %0 \n" "1: \n" // 00 40 01 41 02 42 03 43 // 10 50 11 51 12 52 13 53 // 20 60 21 61 22 62 23 63 // 30 70 31 71 32 72 33 73 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" "subs %w3, %w3, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 // 00 10 01 11 02 12 03 13 // 40 50 41 51 42 52 43 53 "trn1 v16.8b, v0.8b, v1.8b \n" "trn2 v17.8b, v0.8b, v1.8b \n" "trn1 v18.8b, v4.8b, v5.8b \n" "trn2 v19.8b, v4.8b, v5.8b \n" // 20 30 21 31 22 32 23 33 // 60 70 61 71 62 72 63 73 "trn1 v0.8b, v2.8b, v3.8b \n" "trn2 v1.8b, v2.8b, v3.8b \n" "trn1 v4.8b, v6.8b, v7.8b \n" "trn2 v5.8b, v6.8b, v7.8b \n" // 00+10 01+11 02+12 03+13 // 40+50 41+51 42+52 43+53 "uaddlp v16.4h, v16.8b \n" "uaddlp v17.4h, v17.8b \n" "uaddlp v18.4h, v18.8b \n" "uaddlp v19.4h, v19.8b \n" // 60+70 61+71 62+72 63+73 "uaddlp v1.4h, v1.8b \n" "uaddlp v5.4h, v5.8b \n" // combine source lines "add v16.4h, v16.4h, v18.4h \n" "add v17.4h, v17.4h, v19.4h \n" "add v2.4h, v1.4h, v5.4h \n" // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 "uqrshrn v2.8b, v2.8h, #2 \n" // Shuffle 2,3 reg around so that 2 can be added to the // 0,1 reg and 3 can be added to the 4,5 reg. This // requires expanding from u8 to u16 as the 0,1 and 4,5 // registers are already expanded. Then do transposes // to get aligned. 
// xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 // combine source lines "uaddl v0.8h, v0.8b, v4.8b \n" // xx 20 xx 21 xx 22 xx 23 // xx 30 xx 31 xx 32 xx 33 "trn1 v1.8h, v0.8h, v0.8h \n" "trn2 v4.8h, v0.8h, v0.8h \n" "xtn v0.4h, v1.4s \n" "xtn v4.4h, v4.4s \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead // 0+1+2, 3+4+5 "add v16.8h, v16.8h, v0.8h \n" "add v17.8h, v17.8h, v4.8h \n" "prfm pldl1keep, [%2, 448] \n" // Need to divide, but can't downshift as the the value // isn't a power of 2. So multiply by 65536 / n // and take the upper 16 bits. "sqrdmulh v0.8h, v16.8h, v30.8h \n" "sqrdmulh v1.8h, v17.8h, v30.8h \n" // Align for table lookup, vtbl requires registers to // be adjacent "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" "st1 {v3.8b}, [%1], #8 \n" "st1 {v3.s}[2], [%1], #4 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(tmp_src_stride), // %2 "+r"(dst_width) // %3 : "r"(&kMult38_Div6), // %4 "r"(&kShuf38_2) // %5 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v30", "v31", "memory", "cc"); } void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; asm volatile( "movi v31.8b, #3 \n" "1: \n" "ldr d0, [%0], #8 \n" // 01234567 "ldr d1, [%1], #8 \n" // 12345678 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) "st2 {v1.8b, v2.8b}, [%2], #16 \n" // store "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_temp), // %1 "+r"(dst_ptr), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List ); } void ScaleRowUp2_Bilinear_NEON(const uint8_t* 
src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; uint8_t* dst_ptr1 = dst_ptr + dst_stride; const uint8_t* src_temp = src_ptr + 1; const uint8_t* src_temp1 = src_ptr1 + 1; asm volatile( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" "1: \n" "ldr d0, [%0], #8 \n" // 01234567 "ldr d1, [%2], #8 \n" // 12345678 "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b) "ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b) "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) "ldr d0, [%1], #8 \n" "ldr d1, [%3], #8 \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b) "ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b) "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) "mov v0.16b, v4.16b \n" "mov v1.16b, v5.16b \n" "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) "rshrn v2.8b, v2.8h, #4 \n" // 2, odd "rshrn v1.8b, v3.8h, #4 \n" // 2, even "rshrn v4.8b, v4.8h, #4 \n" // 1, odd "rshrn v3.8b, v5.8h, #4 \n" // 1, even "st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1 "st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2 "subs %w6, %w6, #16 \n" // 8 sample -> 16 sample "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(src_temp), // %2 "+r"(src_temp1), // %3 "+r"(dst_ptr), // %4 "+r"(dst_ptr1), // %5 "+r"(dst_width) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", "v31" // Clobber List ); } void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; asm volatile( "movi v31.8h, #3 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v1.8h}, [%1], 
#16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "mov v2.16b, v0.16b \n" "mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd) "mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even) "urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd) "urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even) "st2 {v1.8h, v2.8h}, [%2], #32 \n" // store "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_temp), // %1 "+r"(dst_ptr), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List ); } void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint16_t* src_ptr1 = src_ptr + src_stride; uint16_t* dst_ptr1 = dst_ptr + dst_stride; const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; asm volatile( "movi v31.8h, #3 \n" "1: \n" "ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "mov v0.16b, v2.16b \n" "mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd) "mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even) "ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b) "ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "mov v0.16b, v4.16b \n" "mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd) "mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even) "mov v0.16b, v4.16b \n" "mov v1.16b, v5.16b \n" "mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd) "mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even) "mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd) "mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even) "urshr v2.8h, v2.8h, #4 \n" // 2, odd "urshr v1.8h, v3.8h, #4 \n" // 2, even "urshr v4.8h, v4.8h, #4 \n" // 1, odd "urshr v3.8h, v5.8h, #4 \n" // 1, even "st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1 "st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2 "subs %w6, %w6, #16 \n" 
// 8 sample -> 16 sample "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(src_temp), // %2 "+r"(src_temp1), // %3 "+r"(dst_ptr), // %4 "+r"(dst_ptr1), // %5 "+r"(dst_width) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List ); } void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { const uint16_t* src_temp = src_ptr + 1; asm volatile( "movi v31.8h, #3 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.4s, v0.4h, #0 \n" // 0123 (32b) "ushll2 v3.4s, v0.8h, #0 \n" // 4567 (32b) "ushll v4.4s, v1.4h, #0 \n" // 1234 (32b) "ushll2 v5.4s, v1.8h, #0 \n" // 5678 (32b) "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) "umlal2 v3.4s, v1.8h, v31.8h \n" // 3*near+far (2, odd) "umlal v4.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (2, even) "rshrn v0.4h, v4.4s, #2 \n" // 3/4*near+1/4*far "rshrn2 v0.8h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) "rshrn v1.4h, v2.4s, #2 \n" // 3/4*near+1/4*far "rshrn2 v1.8h, v3.4s, #2 \n" // 3/4*near+1/4*far (odd) "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store "subs %w3, %w3, #16 \n" // 8 sample -> 16 sample "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_temp), // %1 "+r"(dst_ptr), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List ); } void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint16_t* src_ptr1 = src_ptr + src_stride; uint16_t* dst_ptr1 = dst_ptr + dst_stride; const uint16_t* src_temp = src_ptr + 1; const uint16_t* src_temp1 = src_ptr1 + 1; asm volatile( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" "1: \n" "ldr d0, [%0], #8 \n" // 0123 (16b) "ldr d1, [%2], #8 \n" // 1234 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.4s, v0.4h, #0 \n" // 
0123 (32b) "ushll v3.4s, v1.4h, #0 \n" // 1234 (32b) "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) "ldr d0, [%1], #8 \n" // 0123 (16b) "ldr d1, [%3], #8 \n" // 1234 (16b) "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "ushll v4.4s, v0.4h, #0 \n" // 0123 (32b) "ushll v5.4s, v1.4h, #0 \n" // 1234 (32b) "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) "mov v0.16b, v4.16b \n" "mov v1.16b, v5.16b \n" "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) "rshrn v1.4h, v4.4s, #4 \n" // 3/4*near+1/4*far "rshrn v0.4h, v5.4s, #4 \n" // 3/4*near+1/4*far "rshrn v5.4h, v2.4s, #4 \n" // 3/4*near+1/4*far "rshrn v4.4h, v3.4s, #4 \n" // 3/4*near+1/4*far "st2 {v0.4h, v1.4h}, [%4], #16 \n" // store 1 "st2 {v4.4h, v5.4h}, [%5], #16 \n" // store 2 "subs %w6, %w6, #8 \n" // 4 sample -> 8 sample "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(src_temp), // %2 "+r"(src_temp1), // %3 "+r"(dst_ptr), // %4 "+r"(dst_ptr1), // %5 "+r"(dst_width) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", "v31" // Clobber List ); } void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 2; asm volatile( "movi v31.8b, #3 \n" "1: \n" "ldr d0, [%0], #8 \n" // 00112233 (1u1v) "ldr d1, [%1], #8 \n" // 11223344 (1u1v) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store 
"subs %w3, %w3, #8 \n" // 4 uv -> 8 uv "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_temp), // %1 "+r"(dst_ptr), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List ); } void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint8_t* src_ptr1 = src_ptr + src_stride; uint8_t* dst_ptr1 = dst_ptr + dst_stride; const uint8_t* src_temp = src_ptr + 2; const uint8_t* src_temp1 = src_ptr1 + 2; asm volatile( "movi v31.8b, #3 \n" "movi v30.8h, #3 \n" "1: \n" "ldr d0, [%0], #8 \n" "ldr d1, [%2], #8 \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.8h, v0.8b, #0 \n" "ushll v3.8h, v1.8b, #0 \n" "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) "ldr d0, [%1], #8 \n" "ldr d1, [%3], #8 \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "ushll v4.8h, v0.8b, #0 \n" "ushll v5.8h, v1.8b, #0 \n" "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) "mov v0.16b, v4.16b \n" "mov v1.16b, v5.16b \n" "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) "rshrn v2.8b, v2.8h, #4 \n" // 2, odd "rshrn v1.8b, v3.8h, #4 \n" // 2, even "rshrn v4.8b, v4.8h, #4 \n" // 1, odd "rshrn v3.8b, v5.8h, #4 \n" // 1, even "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2 "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1 "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(src_temp), // %2 "+r"(src_temp1), // %3 "+r"(dst_ptr), // %4 "+r"(dst_ptr1), // %5 "+r"(dst_width) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", "v31" // Clobber List ); } void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr, uint16_t* dst_ptr, int 
dst_width) { const uint16_t* src_temp = src_ptr + 2; asm volatile( "movi v31.8h, #3 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b) "ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b) "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) "ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b) "ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b) "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd) "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even) "umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd) "umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even) "rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd) "rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even) "rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd) "rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even) "st2 {v1.2s, v2.2s}, [%2], #16 \n" // store "st2 {v3.2s, v4.2s}, [%2], #16 \n" // store "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_temp), // %1 "+r"(dst_ptr), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List ); } void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) { const uint16_t* src_ptr1 = src_ptr + src_stride; uint16_t* dst_ptr1 = dst_ptr + dst_stride; const uint16_t* src_temp = src_ptr + 2; const uint16_t* src_temp1 = src_ptr1 + 2; asm volatile( "movi v31.4h, #3 \n" "movi v30.4s, #3 \n" "1: \n" "ldr d0, [%0], #8 \n" "ldr d1, [%2], #8 \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b) "ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) "umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd) "umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even) "ldr d0, [%1], #8 \n" "ldr d1, [%3], #8 \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "ushll v4.4s, v0.4h, #0 \n" // 
0011 (1u1v, 32b) "ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b) "umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd) "umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even) "mov v0.16b, v4.16b \n" "mov v1.16b, v5.16b \n" "mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd) "mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even) "mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd) "mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even) "rshrn v1.4h, v2.4s, #4 \n" // 2, odd "rshrn v0.4h, v3.4s, #4 \n" // 2, even "rshrn v3.4h, v4.4s, #4 \n" // 1, odd "rshrn v2.4h, v5.4s, #4 \n" // 1, even "st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2 "st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1 "subs %w6, %w6, #4 \n" // 2 uv -> 4 uv "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_ptr1), // %1 "+r"(src_temp), // %2 "+r"(src_temp1), // %3 "+r"(dst_ptr), // %4 "+r"(dst_ptr1), // %5 "+r"(dst_width) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", "v31" // Clobber List ); } // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. 
void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { asm volatile( "1: \n" "ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator "ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes "uaddw2 v2.8h, v2.8h, v0.16b \n" // add "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uaddw v1.8h, v1.8h, v0.8b \n" "st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator "subs %w2, %w2, #16 \n" // 16 processed per loop "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 : : "memory", "cc", "v0", "v1", "v2" // Clobber List ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD2_DATA8_LANE(n) \ "lsr %5, %3, #16 \n" \ "add %6, %1, %5 \n" \ "add %3, %3, %4 \n" \ "ld2 {v4.b, v5.b}[" #n "], [%6] \n" // The NEON version mimics this formula (from row_common.cc): // #define BLENDER(a, b, f) (uint8_t)((int)(a) + // ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) void ScaleFilterCols_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8_t* src_tmp = src_ptr; int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 "shl v3.4s, v1.4s, #2 \n" // 4 * dx "mul v1.4s, v1.4s, v2.4s \n" // x , x + 1 * dx, x + 2 * dx, x + 3 * dx "add v1.4s, v1.4s, v0.4s \n" // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx "add v2.4s, v1.4s, v3.4s \n" "shl v0.4s, v3.4s, #1 \n" // 8 * dx "1: \n" LOAD2_DATA8_LANE(0) LOAD2_DATA8_LANE(1) LOAD2_DATA8_LANE(2) LOAD2_DATA8_LANE(3) LOAD2_DATA8_LANE(4) LOAD2_DATA8_LANE(5) LOAD2_DATA8_LANE(6) LOAD2_DATA8_LANE(7) "mov v6.16b, v1.16b \n" "mov v7.16b, v2.16b \n" "uzp1 v6.8h, v6.8h, v7.8h \n" "ushll v4.8h, v4.8b, #0 \n" "ushll v5.8h, v5.8b, #0 \n" "ssubl v16.4s, v5.4h, v4.4h \n" "ssubl2 v17.4s, v5.8h, v4.8h \n" "ushll v7.4s, v6.4h, #0 \n" "ushll2 v6.4s, v6.8h, #0 \n" "mul 
v16.4s, v16.4s, v7.4s \n" "mul v17.4s, v17.4s, v6.4s \n" "rshrn v6.4h, v16.4s, #16 \n" "rshrn2 v6.8h, v17.4s, #16 \n" "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" "st1 {v4.8b}, [%0], #8 \n" // store pixels "add v1.4s, v1.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 "+r"(src_tmp) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17" ); } #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; asm volatile( "cmp %w4, #0 \n" "b.eq 100f \n" "add %2, %2, %1 \n" "cmp %w4, #64 \n" "b.eq 75f \n" "cmp %w4, #128 \n" "b.eq 50f \n" "cmp %w4, #192 \n" "b.eq 25f \n" "dup v5.8b, %w4 \n" "dup v4.8b, %w5 \n" // General purpose row blend. "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" "umull v6.8h, v0.8b, v4.8b \n" "umull2 v7.8h, v0.16b, v4.16b \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "umlal v6.8h, v1.8b, v5.8b \n" "umlal2 v7.8h, v1.16b, v5.16b \n" "prfm pldl1keep, [%2, 448] \n" "rshrn v0.8b, v6.8h, #8 \n" "rshrn2 v0.16b, v7.8h, #8 \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "urhadd v0.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 25b \n" "b 99f \n" // Blend 50 / 50. 
"50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "urhadd v0.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" "ld1 {v1.16b}, [%1], #16 \n" "ld1 {v0.16b}, [%2], #16 \n" "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "urhadd v0.16b, v0.16b, v1.16b \n" "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" "ld1 {v0.16b}, [%1], #16 \n" "subs %w3, %w3, #16 \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" "99: \n" "st1 {v0.b}[15], [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2 "+r"(dst_width), // %3 "+r"(source_y_fraction), // %4 "+r"(y_fraction) // %5 : : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); } void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { (void)src_stride; asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "mov v2.16b, v3.16b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List ); } void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, uint8_t* dst_argb, int dst_width) { (void)src_stride; asm volatile( "1: \n" // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add 
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "urhadd v1.16b, v2.16b, v3.16b \n" "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "rshrn v0.8b, v0.8h, #2 \n" // round and pack "rshrn v1.8b, v1.8h, #2 \n" "rshrn v2.8b, v2.8h, #2 \n" "rshrn v3.8b, v3.8h, #2 \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { (void)src_stride; asm volatile( "1: \n" "ld1 {v0.s}[0], [%0], %3 \n" "ld1 {v0.s}[1], [%0], %3 \n" "ld1 {v0.s}[2], [%0], %3 \n" "ld1 {v0.s}[3], [%0], %3 \n" "subs %w2, %w2, #4 \n" // 4 pixels per loop. 
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "st1 {v0.16b}, [%1], #16 \n" "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(dst_width) // %2 : "r"((int64_t)(src_stepx * 4)) // %3 : "memory", "cc", "v0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. // TODO(Yang Zhang): Might be worth another optimization pass in future. // It could be upgraded to 8 pixels at a time to start with. void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, uint8_t* dst_argb, int dst_width) { asm volatile( "add %1, %1, %0 \n" "1: \n" "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 "ld1 {v1.8b}, [%1], %4 \n" "ld1 {v2.8b}, [%0], %4 \n" "ld1 {v3.8b}, [%1], %4 \n" "ld1 {v4.8b}, [%0], %4 \n" "ld1 {v5.8b}, [%1], %4 \n" "ld1 {v6.8b}, [%0], %4 \n" "ld1 {v7.8b}, [%1], %4 \n" "uaddl v0.8h, v0.8b, v1.8b \n" "uaddl v2.8h, v2.8b, v3.8b \n" "uaddl v4.8h, v4.8b, v5.8b \n" "uaddl v6.8h, v6.8b, v7.8b \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd "mov v0.d[1], v2.d[0] \n" "mov v2.d[0], v16.d[1] \n" "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh "mov v4.d[1], v6.d[0] \n" "mov v6.d[0], v16.d[1] \n" "prfm pldl1keep, [%1, 448] \n" "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. "subs %w3, %w3, #4 \n" // 4 pixels per loop. 
"st1 {v0.16b}, [%2], #16 \n" "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(src_stride), // %1 "+r"(dst_argb), // %2 "+r"(dst_width) // %3 : "r"((int64_t)(src_stepx * 4)) // %4 : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD1_DATA32_LANE(vn, n) \ "lsr %5, %3, #16 \n" \ "add %6, %1, %5, lsl #2 \n" \ "add %3, %3, %4 \n" \ "ld1 {" #vn ".s}[" #n "], [%6] \n" void ScaleARGBCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { const uint8_t* src_tmp = src_argb; int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT int64_t tmp64; asm volatile( "1: \n" // clang-format off LOAD1_DATA32_LANE(v0, 0) LOAD1_DATA32_LANE(v0, 1) LOAD1_DATA32_LANE(v0, 2) LOAD1_DATA32_LANE(v0, 3) LOAD1_DATA32_LANE(v1, 0) LOAD1_DATA32_LANE(v1, 1) LOAD1_DATA32_LANE(v1, 2) LOAD1_DATA32_LANE(v1, 3) "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead // clang-format on "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels "subs %w2, %w2, #8 \n" // 8 processed per loop "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "=&r"(tmp64), // %5 "+r"(src_tmp) // %6 : : "memory", "cc", "v0", "v1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping #define LOAD2_DATA32_LANE(vn1, vn2, n) \ "lsr %5, %3, #16 \n" \ "add %6, %1, %5, lsl #2 \n" \ "add %3, %3, %4 \n" \ "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, const uint8_t* src_argb, int dst_width, int x, int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; const uint8_t* src_tmp = src_argb; int64_t x64 = (int64_t)x; // NOLINT int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 "shl v6.4s, v1.4s, #2 \n" // 4 * dx "mul v1.4s, 
v1.4s, v2.4s \n" "movi v3.16b, #0x7f \n" // 0x7F "movi v4.8h, #0x7f \n" // 0x7F // x , x + 1 * dx, x + 2 * dx, x + 3 * dx "add v5.4s, v1.4s, v0.4s \n" "1: \n" // d0, d1: a // d2, d3: b LOAD2_DATA32_LANE(v0, v1, 0) LOAD2_DATA32_LANE(v0, v1, 1) LOAD2_DATA32_LANE(v0, v1, 2) LOAD2_DATA32_LANE(v0, v1, 3) "shrn v2.4h, v5.4s, #9 \n" "and v2.8b, v2.8b, v4.8b \n" "dup v16.8b, v2.b[0] \n" "dup v17.8b, v2.b[2] \n" "dup v18.8b, v2.b[4] \n" "dup v19.8b, v2.b[6] \n" "ext v2.8b, v16.8b, v17.8b, #4 \n" "ext v17.8b, v18.8b, v19.8b, #4 \n" "ins v2.d[1], v17.d[0] \n" // f "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f "umull v16.8h, v0.8b, v7.8b \n" "umull2 v17.8h, v0.16b, v7.16b \n" "umull v18.8h, v1.8b, v2.8b \n" "umull2 v19.8h, v1.16b, v2.16b \n" "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead "add v16.8h, v16.8h, v18.8h \n" "add v17.8h, v17.8h, v19.8h \n" "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 "+r"(src_tmp) // %6 : : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19" ); } #undef LOAD2_DATA32_LANE // Read 16x2 average down and write 8x1. 
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "1: \n" "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc "subs %w3, %w3, #8 \n" // 8 processed per loop "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent "uaddlp v1.4s, v1.8h \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent "uadalp v1.4s, v3.8h \n" "prfm pldl1keep, [%1, 448] \n" "rshrn v0.4h, v0.4s, #2 \n" // round and pack "rshrn2 v0.8h, v1.4s, #2 \n" "st1 {v0.8h}, [%2], #16 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List ); } // Read 8x2 upsample with filtering and write 16x1. // Actually reads an extra pixel, so 9x2. void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width) { asm volatile( "add %1, %0, %1, lsl #1 \n" // ptr + stide * 2 "movi v0.8h, #9 \n" // constants "movi v1.4s, #3 \n" "1: \n" "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 "subs %w3, %w3, #16 \n" // 16 dst pixels per loop "umull v16.4s, v3.4h, v0.4h \n" "umull2 v7.4s, v3.8h, v0.8h \n" "umull v18.4s, v4.4h, v0.4h \n" "umull2 v17.4s, v4.8h, v0.8h \n" "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uaddw v16.4s, v16.4s, v6.4h \n" "uaddl2 v19.4s, v6.8h, v3.8h \n" "uaddl v3.4s, v6.4h, v3.4h \n" "uaddw2 v6.4s, v7.4s, v6.8h \n" "uaddl2 v7.4s, v5.8h, v4.8h \n" "uaddl v4.4s, v5.4h, v4.4h \n" "uaddw v18.4s, v18.4s, v5.4h \n" "prfm pldl1keep, [%1, 448] \n" "mla v16.4s, v4.4s, v1.4s \n" "mla v18.4s, v3.4s, v1.4s \n" "mla v6.4s, v7.4s, v1.4s \n" "uaddw2 v4.4s, v17.4s, v5.8h 
\n" "uqrshrn v16.4h, v16.4s, #4 \n" "mla v4.4s, v19.4s, v1.4s \n" "uqrshrn2 v16.8h, v6.4s, #4 \n" "uqrshrn v17.4h, v18.4s, #4 \n" "uqrshrn2 v17.8h, v4.4s, #4 \n" "st2 {v16.8h-v17.8h}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : "r"(2LL), // %4 "r"(14LL) // %5 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19" // Clobber List ); } void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst, int dst_width) { asm volatile( // change the stride to row 2 pointer "add %1, %1, %0 \n" "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 UV "subs %w3, %w3, #8 \n" // 8 processed per loop. "uaddlp v0.8h, v0.16b \n" // U 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // V 16 bytes -> 8 shorts. "ld2 {v16.16b,v17.16b}, [%1], #32 \n" // load 16 "uadalp v0.8h, v16.16b \n" // U 16 bytes -> 8 shorts. "uadalp v1.8h, v17.16b \n" // V 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "rshrn v0.8b, v0.8h, #2 \n" // round and pack "prfm pldl1keep, [%1, 448] \n" "rshrn v1.8b, v1.8h, #2 \n" "st2 {v0.8b,v1.8b}, [%2], #16 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 "+r"(dst), // %2 "+r"(dst_width) // %3 : : "memory", "cc", "v0", "v1", "v16", "v17"); } // Reads 4 pixels at a time. void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, // pixel step uint8_t* dst_ptr, int dst_width) { const uint8_t* src1_ptr = src_ptr + src_stepx * 2; const uint8_t* src2_ptr = src_ptr + src_stepx * 4; const uint8_t* src3_ptr = src_ptr + src_stepx * 6; (void)src_stride; asm volatile( "1: \n" "ld1 {v0.h}[0], [%0], %6 \n" "ld1 {v1.h}[0], [%1], %6 \n" "ld1 {v2.h}[0], [%2], %6 \n" "ld1 {v3.h}[0], [%3], %6 \n" "subs %w5, %w5, #4 \n" // 4 pixels per loop. 
"st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n" "b.gt 1b \n" : "+r"(src_ptr), // %0 "+r"(src1_ptr), // %1 "+r"(src2_ptr), // %2 "+r"(src3_ptr), // %3 "+r"(dst_ptr), // %4 "+r"(dst_width) // %5 : "r"((int64_t)(src_stepx * 8)) // %6 : "memory", "cc", "v0", "v1", "v2", "v3"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_uv.cc000066400000000000000000001133531416500237200211440ustar00rootroot00000000000000/* * Copyright 2020 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "libyuv/scale.h" #include #include #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" // For CopyUV #include "libyuv/row.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // Macros to enable specialized scalers #ifndef HAS_SCALEUVDOWN2 #define HAS_SCALEUVDOWN2 1 #endif #ifndef HAS_SCALEUVDOWN4BOX #define HAS_SCALEUVDOWN4BOX 1 #endif #ifndef HAS_SCALEUVDOWNEVEN #define HAS_SCALEUVDOWNEVEN 1 #endif #ifndef HAS_SCALEUVBILINEARDOWN #define HAS_SCALEUVBILINEARDOWN 1 #endif #ifndef HAS_SCALEUVBILINEARUP #define HAS_SCALEUVBILINEARUP 1 #endif #ifndef HAS_UVCOPY #define HAS_UVCOPY 1 #endif #ifndef HAS_SCALEPLANEVERTICAL #define HAS_SCALEPLANEVERTICAL 1 #endif static __inline int Abs(int v) { return v >= 0 ? v : -v; } // ScaleUV, 1/2 // This is an optimized version for scaling down a UV to 1/2 of // its original size. 
#if HAS_SCALEUVDOWN2
// Scales an interleaved UV plane by exactly 1/2 horizontally.
//   dst_width/dst_height    output size in UV pixels / rows.
//   src_stride/dst_stride   row pitches in bytes.
//   x, y                    16.16 fixed-point start position in the source.
//   dx                      must be 2.0 (asserted); dy must be a multiple of
//                           2.0 (asserted).
//   filtering               selects the row kernel: none, linear or box.
// src_width and src_height are unused.
static void ScaleUVDown2(int src_width,
                         int src_height,
                         int dst_width,
                         int dst_height,
                         int src_stride,
                         int dst_stride,
                         const uint8_t* src_uv,
                         uint8_t* dst_uv,
                         int x,
                         int dx,
                         int y,
                         int dy,
                         enum FilterMode filtering) {
  int j;
  // Bytes to advance between source rows. The product is widened before
  // multiplying, like the start-offset computation below, so a large stride
  // times a large row step cannot overflow int. It is computed before
  // src_stride is zeroed for kFilterLinear further down.
  const ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (int64_t)src_stride);
  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
                          uint8_t* dst_uv, int dst_width) =
      filtering == kFilterNone
          ? ScaleUVRowDown2_C
          : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_C
                                        : ScaleUVRowDown2Box_C);
  (void)src_width;
  (void)src_height;
  (void)dx;
  assert(dx == 65536 * 2);      // Test scale factor of 2.
  assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
  // Advance to odd row, even column.
  if (filtering == kFilterBilinear) {
    src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2;
  } else {
    src_uv += (y >> 16) * (int64_t)src_stride + ((x >> 16) - 1) * 2;
  }

#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && filtering) {
    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
    if (IS_ALIGNED(dst_width, 4)) {
      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
    }
  }
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && filtering) {
    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
    }
  }
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
  if (TestCpuFlag(kCpuHasNEON) && filtering) {
    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
    }
  }
#endif

// This code is not enabled.  Only box filter is available at this time.
#if defined(HAS_SCALEUVROWDOWN2_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleUVRowDown2 =
        filtering == kFilterNone
            ? ScaleUVRowDown2_Any_SSSE3
            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_SSSE3
                                          : ScaleUVRowDown2Box_Any_SSSE3);
    if (IS_ALIGNED(dst_width, 2)) {
      ScaleUVRowDown2 =
          filtering == kFilterNone
              ? ScaleUVRowDown2_SSSE3
              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_SSSE3
                                            : ScaleUVRowDown2Box_SSSE3);
    }
  }
#endif
// This code is not enabled.  Only box filter is available at this time.
#if defined(HAS_SCALEUVROWDOWN2_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleUVRowDown2 =
        filtering == kFilterNone
            ? ScaleUVRowDown2_Any_NEON
            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_NEON
                                          : ScaleUVRowDown2Box_Any_NEON);
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleUVRowDown2 =
          filtering == kFilterNone
              ? ScaleUVRowDown2_NEON
              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_NEON
                                            : ScaleUVRowDown2Box_NEON);
    }
  }
#endif
#if defined(HAS_SCALEUVROWDOWN2_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ScaleUVRowDown2 =
        filtering == kFilterNone
            ? ScaleUVRowDown2_Any_MMI
            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MMI
                                          : ScaleUVRowDown2Box_Any_MMI);
    if (IS_ALIGNED(dst_width, 2)) {
      ScaleUVRowDown2 =
          filtering == kFilterNone
              ? ScaleUVRowDown2_MMI
              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MMI
                                            : ScaleUVRowDown2Box_MMI);
    }
  }
#endif
#if defined(HAS_SCALEUVROWDOWN2_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ScaleUVRowDown2 =
        filtering == kFilterNone
            ? ScaleUVRowDown2_Any_MSA
            : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_Any_MSA
                                          : ScaleUVRowDown2Box_Any_MSA);
    if (IS_ALIGNED(dst_width, 2)) {
      ScaleUVRowDown2 =
          filtering == kFilterNone
              ? ScaleUVRowDown2_MSA
              : (filtering == kFilterLinear ? ScaleUVRowDown2Linear_MSA
                                            : ScaleUVRowDown2Box_MSA);
    }
  }
#endif

  // Linear filtering passes a zero stride to the row kernel; row_stride above
  // still carries the real row advance.
  if (filtering == kFilterLinear) {
    src_stride = 0;
  }
  for (j = 0; j < dst_height; ++j) {
    ScaleUVRowDown2(src_uv, src_stride, dst_uv, dst_width);
    src_uv += row_stride;
    dst_uv += dst_stride;
  }
}
#endif  // HAS_SCALEUVDOWN2

// ScaleUV, 1/4
// This is an optimized version for scaling down a UV to 1/4 of
// its original size.
#if HAS_SCALEUVDOWN4BOX
// Each output row is built from three 1/2 box passes: two passes reduce source
// row pairs into a pair of temporary rows of dst_width * 2 UV pixels, and a
// third pass reduces those two temporary rows to the destination row.
//   dx must be 4.0 (asserted); dy must be a multiple of 4.0 (asserted).
// src_width and src_height are unused.
static void ScaleUVDown4Box(int src_width,
                            int src_height,
                            int dst_width,
                            int dst_height,
                            int src_stride,
                            int dst_stride,
                            const uint8_t* src_uv,
                            uint8_t* dst_uv,
                            int x,
                            int dx,
                            int y,
                            int dy) {
  int j;
  // Allocate 2 rows of UV.
  const int kRowSize = (dst_width * 2 * 2 + 15) & ~15;
  align_buffer_64(row, kRowSize * 2);
  // Bytes to advance between source rows; widened before multiplying so the
  // product cannot overflow int.
  const ptrdiff_t row_stride = (ptrdiff_t)((dy >> 16) * (int64_t)src_stride);
  void (*ScaleUVRowDown2)(const uint8_t* src_uv, ptrdiff_t src_stride,
                          uint8_t* dst_uv, int dst_width) =
      ScaleUVRowDown2Box_C;
  // Advance to odd row, even column.
  src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2;
  (void)src_width;
  (void)src_height;
  (void)dx;
  assert(dx == 65536 * 4);      // Test scale factor of 4.
  assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.

#if defined(HAS_SCALEUVROWDOWN2BOX_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_SSSE3;
    if (IS_ALIGNED(dst_width, 4)) {
      ScaleUVRowDown2 = ScaleUVRowDown2Box_SSSE3;
    }
  }
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_AVX2;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleUVRowDown2 = ScaleUVRowDown2Box_AVX2;
    }
  }
#endif
#if defined(HAS_SCALEUVROWDOWN2BOX_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleUVRowDown2 = ScaleUVRowDown2Box_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleUVRowDown2 = ScaleUVRowDown2Box_NEON;
    }
  }
#endif

  for (j = 0; j < dst_height; ++j) {
    // Source rows 0-1 -> first temporary row, rows 2-3 -> second.
    ScaleUVRowDown2(src_uv, src_stride, row, dst_width * 2);
    ScaleUVRowDown2(src_uv + src_stride * 2, src_stride, row + kRowSize,
                    dst_width * 2);
    // The two temporary rows -> one destination row.
    ScaleUVRowDown2(row, kRowSize, dst_uv, dst_width);
    src_uv += row_stride;
    dst_uv += dst_stride;
  }
  free_aligned_buffer_64(row);
}
#endif  // HAS_SCALEUVDOWN4BOX

// ScaleUV Even
// This is an optimized version for scaling down a UV to even
// multiple of its original size.
#if HAS_SCALEUVDOWNEVEN static void ScaleUVDownEven(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_uv, uint8_t* dst_uv, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; int col_step = dx >> 16; int row_stride = (dy >> 16) * (int64_t)src_stride; void (*ScaleUVRowDownEven)(const uint8_t* src_uv, ptrdiff_t src_stride, int src_step, uint8_t* dst_uv, int dst_width) = filtering ? ScaleUVRowDownEvenBox_C : ScaleUVRowDownEven_C; (void)src_width; (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_uv += (y >> 16) * (int64_t)src_stride + (x >> 16) * 2; #if defined(HAS_SCALEUVROWDOWNEVEN_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_SSSE3 : ScaleUVRowDownEven_Any_SSSE3; if (IS_ALIGNED(dst_width, 4)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_SSE2 : ScaleUVRowDownEven_SSSE3; } } #endif #if defined(HAS_SCALEUVROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON) && !filtering) { ScaleUVRowDownEven = ScaleUVRowDownEven_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleUVRowDownEven = ScaleUVRowDownEven_NEON; } } #endif // TODO(fbarchard): Enable Box filter #if defined(HAS_SCALEUVROWDOWNEVENBOX_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_NEON : ScaleUVRowDownEven_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_NEON : ScaleUVRowDownEven_NEON; } } #endif #if defined(HAS_SCALEUVROWDOWNEVEN_MMI) if (TestCpuFlag(kCpuHasMMI)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_Any_MMI : ScaleUVRowDownEven_Any_MMI; if (IS_ALIGNED(dst_width, 2)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_MMI : ScaleUVRowDownEven_MMI; } } #endif #if defined(HAS_SCALEUVROWDOWNEVEN_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleUVRowDownEven = filtering ? 
ScaleUVRowDownEvenBox_Any_MSA : ScaleUVRowDownEven_Any_MSA; if (IS_ALIGNED(dst_width, 4)) { ScaleUVRowDownEven = filtering ? ScaleUVRowDownEvenBox_MSA : ScaleUVRowDownEven_MSA; } } #endif if (filtering == kFilterLinear) { src_stride = 0; } for (j = 0; j < dst_height; ++j) { ScaleUVRowDownEven(src_uv, src_stride, col_step, dst_uv, dst_width); src_uv += row_stride; dst_uv += dst_stride; } } #endif // Scale UV down with bilinear interpolation. #if HAS_SCALEUVBILINEARDOWN static void ScaleUVBilinearDown(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_uv, uint8_t* dst_uv, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleUVFilterCols64_C : ScaleUVFilterCols_C; int64_t xlast = x + (int64_t)(dst_width - 1) * dx; int64_t xl = (dx >= 0) ? x : xlast; int64_t xr = (dx >= 0) ? xlast : x; int clip_src_width; xl = (xl >> 16) & ~3; // Left edge aligned. xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. if (xr > src_width) { xr = src_width; } clip_src_width = (int)(xr - xl) * 2; // Width aligned to 2. 
src_uv += xl * 2; x -= (int)(xl << 16); #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(clip_src_width, 16)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(clip_src_width, 32)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(clip_src_width, 16)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(clip_src_width, 32)) { InterpolateRow = InterpolateRow_MSA; } } #endif #if defined(HAS_SCALEUVFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; } #endif #if defined(HAS_SCALEUVFILTERCOLS_NEON) if (TestCpuFlag(kCpuHasNEON)) { ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; if (IS_ALIGNED(dst_width, 4)) { ScaleUVFilterCols = ScaleUVFilterCols_NEON; } } #endif #if defined(HAS_SCALEUVFILTERCOLS_MSA) if (TestCpuFlag(kCpuHasMSA)) { ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { ScaleUVFilterCols = ScaleUVFilterCols_MSA; } } #endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of UV. 
{ align_buffer_64(row, clip_src_width * 2); const int max_y = (src_height - 1) << 16; if (y > max_y) { y = max_y; } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; const uint8_t* src = src_uv + yi * (int64_t)src_stride; if (filtering == kFilterLinear) { ScaleUVFilterCols(dst_uv, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; InterpolateRow(row, src, src_stride, clip_src_width, yf); ScaleUVFilterCols(dst_uv, row, dst_width, x, dx); } dst_uv += dst_stride; y += dy; if (y > max_y) { y = max_y; } } free_aligned_buffer_64(row); } } #endif // Scale UV up with bilinear interpolation. #if HAS_SCALEUVBILINEARUP static void ScaleUVBilinearUp(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_uv, uint8_t* dst_uv, int x, int dx, int y, int dy, enum FilterMode filtering) { int j; void (*InterpolateRow)(uint8_t * dst_uv, const uint8_t* src_uv, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; void (*ScaleUVFilterCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width, int x, int dx) = filtering ? 
ScaleUVFilterCols_C : ScaleUVCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_SSSE3; } } #endif #if defined(HAS_INTERPOLATEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { InterpolateRow = InterpolateRow_Any_AVX2; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_AVX2; } } #endif #if defined(HAS_INTERPOLATEROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { InterpolateRow = InterpolateRow_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { InterpolateRow = InterpolateRow_NEON; } } #endif #if defined(HAS_INTERPOLATEROW_MMI) if (TestCpuFlag(kCpuHasMMI)) { InterpolateRow = InterpolateRow_Any_MMI; if (IS_ALIGNED(dst_width, 4)) { InterpolateRow = InterpolateRow_MMI; } } #endif #if defined(HAS_INTERPOLATEROW_MSA) if (TestCpuFlag(kCpuHasMSA)) { InterpolateRow = InterpolateRow_Any_MSA; if (IS_ALIGNED(dst_width, 16)) { InterpolateRow = InterpolateRow_MSA; } } #endif if (src_width >= 32768) { ScaleUVFilterCols = filtering ? 
ScaleUVFilterCols64_C : ScaleUVCols64_C; } #if defined(HAS_SCALEUVFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVFilterCols_SSSE3; } #endif #if defined(HAS_SCALEUVFILTERCOLS_NEON) if (filtering && TestCpuFlag(kCpuHasNEON)) { ScaleUVFilterCols = ScaleUVFilterCols_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleUVFilterCols = ScaleUVFilterCols_NEON; } } #endif #if defined(HAS_SCALEUVFILTERCOLS_MSA) if (filtering && TestCpuFlag(kCpuHasMSA)) { ScaleUVFilterCols = ScaleUVFilterCols_Any_MSA; if (IS_ALIGNED(dst_width, 16)) { ScaleUVFilterCols = ScaleUVFilterCols_MSA; } } #endif #if defined(HAS_SCALEUVCOLS_SSSE3) if (!filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleUVFilterCols = ScaleUVCols_SSSE3; } #endif #if defined(HAS_SCALEUVCOLS_NEON) if (!filtering && TestCpuFlag(kCpuHasNEON)) { ScaleUVFilterCols = ScaleUVCols_Any_NEON; if (IS_ALIGNED(dst_width, 16)) { ScaleUVFilterCols = ScaleUVCols_NEON; } } #endif #if defined(HAS_SCALEUVCOLS_MMI) if (!filtering && TestCpuFlag(kCpuHasMMI)) { ScaleUVFilterCols = ScaleUVCols_Any_MMI; if (IS_ALIGNED(dst_width, 1)) { ScaleUVFilterCols = ScaleUVCols_MMI; } } #endif #if defined(HAS_SCALEUVCOLS_MSA) if (!filtering && TestCpuFlag(kCpuHasMSA)) { ScaleUVFilterCols = ScaleUVCols_Any_MSA; if (IS_ALIGNED(dst_width, 8)) { ScaleUVFilterCols = ScaleUVCols_MSA; } } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleUVFilterCols = ScaleUVColsUp2_C; #if defined(HAS_SCALEUVCOLSUP2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) { ScaleUVFilterCols = ScaleUVColsUp2_SSSE3; } #endif #if defined(HAS_SCALEUVCOLSUP2_MMI) if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) { ScaleUVFilterCols = ScaleUVColsUp2_MMI; } #endif } if (y > max_y) { y = max_y; } { int yi = y >> 16; const uint8_t* src = src_uv + yi * (int64_t)src_stride; // Allocate 2 rows of UV. 
const int kRowSize = (dst_width * 2 + 15) & ~15; align_buffer_64(row, kRowSize * 2); uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; ScaleUVFilterCols(rowptr, src, dst_width, x, dx); if (src_height > 1) { src += src_stride; } ScaleUVFilterCols(rowptr + rowstride, src, dst_width, x, dx); src += src_stride; for (j = 0; j < dst_height; ++j) { yi = y >> 16; if (yi != lasty) { if (y > max_y) { y = max_y; yi = y >> 16; src = src_uv + yi * (int64_t)src_stride; } if (yi != lasty) { ScaleUVFilterCols(rowptr, src, dst_width, x, dx); rowptr += rowstride; rowstride = -rowstride; lasty = yi; src += src_stride; } } if (filtering == kFilterLinear) { InterpolateRow(dst_uv, rowptr, 0, dst_width * 2, 0); } else { int yf = (y >> 8) & 255; InterpolateRow(dst_uv, rowptr, rowstride, dst_width * 2, yf); } dst_uv += dst_stride; y += dy; } free_aligned_buffer_64(row); } } #endif // HAS_SCALEUVBILINEARUP // Scale UV, horizontally up by 2 times. // Uses linear filter horizontally, nearest vertically. // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of NV16 to NV24. void ScaleUVLinearUp2(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_uv, uint8_t* dst_uv) { void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = ScaleUVRowUp2_Linear_Any_C; int i; int y; int dy; // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2)); #ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; } #endif #ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2LINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; } #endif if (dst_height == 1) { ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv, dst_width); } else { dy = FixedDiv(src_height - 1, dst_height - 1); y = (1 << 15) - 1; for (i = 0; i < dst_height; ++i) { ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width); dst_uv += dst_stride; y += dy; } } } // Scale plane, up by 2 times. // This is an optimized version for scaling up a plane to 2 times of // its original size, using bilinear interpolation. // This is used to scale U and V planes of NV12 to NV24. void ScaleUVBilinearUp2(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8_t* src_ptr, uint8_t* dst_ptr) { void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = ScaleUVRowUp2_Bilinear_Any_C; int x; // This function can only scale up by 2 times. 
assert(src_width == ((dst_width + 1) / 2)); assert(src_height == ((dst_height + 1) / 2)); #ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; } #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; } #endif #ifdef HAS_SCALEUVROWUP2BILINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; } #endif Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); dst_ptr += dst_stride; for (x = 0; x < src_height - 1; ++x) { Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); src_ptr += src_stride; // TODO(fbarchard): Test performance of writing one row of destination at a // time. dst_ptr += 2 * dst_stride; } if (!(dst_height & 1)) { Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); } } // Scale 16 bit UV, horizontally up by 2 times. // Uses linear filter horizontally, nearest vertically. // This is an optimized version for scaling up a plane to 2 times of // its original width, using linear interpolation. // This is used to scale U and V planes of P210 to P410. void ScaleUVLinearUp2_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16_t* src_uv, uint16_t* dst_uv) { void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) = ScaleUVRowUp2_Linear_16_Any_C; int i; int y; int dy; // This function can only scale up by 2 times horizontally. 
assert(src_width == ((dst_width + 1) / 2));

  // Replace ScaleRowUp with a SIMD variant when it is compiled in and the CPU
  // reports the feature; later checks override earlier ones.
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
  }
#endif

  if (dst_height == 1) {
    // Single output row: sample the middle source row.
    ScaleRowUp(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride, dst_uv,
               dst_width);
  } else {
    // y carries 16 fractional bits (the source row index is y >> 16) and
    // starts just below one half.
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
      ScaleRowUp(src_uv + (y >> 16) * (int64_t)src_stride, dst_uv, dst_width);
      dst_uv += dst_stride;
      y += dy;
    }
  }
}

// Scale 16 bit UV, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// This is used to scale U and V planes of P010 to P410.
//
// Both dimensions must satisfy src == (dst + 1) / 2 (asserted below).
// The first destination row is produced from the first source row alone
// (both strides passed as 0). Each loop iteration then hands the row
// function a source row pair and advances the destination by two rows.
// When dst_height is even, the final destination row is produced from the
// last source row alone.
void ScaleUVBilinearUp2_16(int src_width,
                           int src_height,
                           int dst_width,
                           int dst_height,
                           int src_stride,
                           int dst_stride,
                           const uint16_t* src_ptr,
                           uint16_t* dst_ptr) {
  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
      ScaleUVRowUp2_Bilinear_16_Any_C;
  int x;  // Row counter for the loop below.

  // This function can only scale up by 2 times.
  assert(src_width == ((dst_width + 1) / 2));
  assert(src_height == ((dst_height + 1) / 2));

  // Pick the widest available SIMD row function; later checks override
  // earlier ones.
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
  }
#endif

  // Top edge: strides of 0 make the row function see the same row twice.
  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  dst_ptr += dst_stride;
  for (x = 0; x < src_height - 1; ++x) {
    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
    src_ptr += src_stride;
    // TODO(fbarchard): Test performance of writing one row of destination at a
    // time.
    dst_ptr += 2 * dst_stride;
  }
  // Bottom edge: only needed when the destination height is even.
  if (!(dst_height & 1)) {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  }
}

// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
// the lower 16 bits are the fixed decimal part.
//
// x/dx are forwarded to the column function; y/dy select the source row for
// each destination row (row index is y >> 16). src_height is unused.
static void ScaleUVSimple(int src_width,
                          int src_height,
                          int dst_width,
                          int dst_height,
                          int src_stride,
                          int dst_stride,
                          const uint8_t* src_uv,
                          uint8_t* dst_uv,
                          int x,
                          int dx,
                          int y,
                          int dy) {
  int j;
  // Sources 32768 pixels wide or more use the 64-bit C column function.
  void (*ScaleUVCols)(uint8_t * dst_uv, const uint8_t* src_uv, int dst_width,
                      int x, int dx) =
      (src_width >= 32768) ? ScaleUVCols64_C : ScaleUVCols_C;
  (void)src_height;
#if defined(HAS_SCALEUVCOLS_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
    ScaleUVCols = ScaleUVCols_SSSE3;
  }
#endif
#if defined(HAS_SCALEUVCOLS_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleUVCols = ScaleUVCols_Any_NEON;
    if (IS_ALIGNED(dst_width, 8)) {
      ScaleUVCols = ScaleUVCols_NEON;
    }
  }
#endif
#if defined(HAS_SCALEUVCOLS_MMI)
  if (TestCpuFlag(kCpuHasMMI)) {
    ScaleUVCols = ScaleUVCols_Any_MMI;
    if (IS_ALIGNED(dst_width, 1)) {
      ScaleUVCols = ScaleUVCols_MMI;
    }
  }
#endif
#if defined(HAS_SCALEUVCOLS_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    ScaleUVCols = ScaleUVCols_Any_MSA;
    if (IS_ALIGNED(dst_width, 4)) {
      ScaleUVCols = ScaleUVCols_MSA;
    }
  }
#endif
  // Exact 2x horizontal upscale starting in the first half pixel: use the
  // dedicated Up2 column functions, which override any choice made above.
  if (src_width * 2 == dst_width && x < 0x8000) {
    ScaleUVCols = ScaleUVColsUp2_C;
#if defined(HAS_SCALEUVCOLSUP2_SSSE3)
    if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(dst_width, 8)) {
      ScaleUVCols = ScaleUVColsUp2_SSSE3;
    }
#endif
#if defined(HAS_SCALEUVCOLSUP2_MMI)
    if (TestCpuFlag(kCpuHasMMI) && IS_ALIGNED(dst_width, 4)) {
      ScaleUVCols = ScaleUVColsUp2_MMI;
    }
#endif
  }

  for (j = 0; j < dst_height; ++j) {
    ScaleUVCols(dst_uv, src_uv + (y >> 16) * (int64_t)src_stride, dst_width, x,
                dx);
    dst_uv += dst_stride;
    y += dy;
  }
}

// Copy UV with optional flipping
#if HAS_UVCOPY
// Copies an 8-bit UV plane via CopyPlane, passing width * 2 as the row width.
// A negative height copies the rows in reverse order.
// Returns 0 on success, -1 if a pointer is null, width <= 0 or height == 0.
static int UVCopy(const uint8_t* src_uv,
                  int src_stride_uv,
                  uint8_t* dst_uv,
                  int dst_stride_uv,
                  int width,
                  int height) {
  if (!src_uv || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    // Start at the last row and walk backwards.
    src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }

  CopyPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height);
  return 0;
}

// 16-bit counterpart of UVCopy; same argument checks and return values.
static int UVCopy_16(const uint16_t* src_uv,
                     int src_stride_uv,
                     uint16_t* dst_uv,
                     int dst_stride_uv,
                     int width,
                     int height) {
  if (!src_uv || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
if (height < 0) { height = -height; src_uv = src_uv + (height - 1) * (int64_t)src_stride_uv; src_stride_uv = -src_stride_uv; } CopyPlane_16(src_uv, src_stride_uv, dst_uv, dst_stride_uv, width * 2, height); return 0; } #endif // HAS_UVCOPY // Scale a UV plane (from NV12) // This function in turn calls a scaling function // suitable for handling the desired resolutions. static void ScaleUV(const uint8_t* src, int src_stride, int src_width, int src_height, uint8_t* dst, int dst_stride, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; // UV does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. if (src_height < 0) { src_height = -src_height; src = src + (src_height - 1) * (int64_t)src_stride; src_stride = -src_stride; } ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); src_width = Abs(src_width); if (clip_x) { int64_t clipf = (int64_t)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 2; dst += clip_x * 2; } if (clip_y) { int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * (int64_t)src_stride; dst += clip_y * dst_stride; } // Special case for integer step values. if (((dx | dy) & 0xffff) == 0) { if (!dx || !dy) { // 1 pixel wide and/or tall. filtering = kFilterNone; } else { // Optimized even scale down. ie 2, 4, 6, 8, 10x. if (!(dx & 0x10000) && !(dy & 0x10000)) { #if HAS_SCALEUVDOWN2 if (dx == 0x20000) { // Optimized 1/2 downsample. 
ScaleUVDown2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } #endif #if HAS_SCALEUVDOWN4BOX if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample. ScaleUVDown4Box(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy); return; } #endif #if HAS_SCALEUVDOWNEVEN ScaleUVDownEven(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; #endif } // Optimized odd scale down. ie 3, 5, 7, 9x. if ((dx & 0x10000) && (dy & 0x10000)) { filtering = kFilterNone; #ifdef HAS_UVCOPY if (dx == 0x10000 && dy == 0x10000) { // Straight copy. UVCopy(src + (y >> 16) * (int64_t)src_stride + (x >> 16) * 2, src_stride, dst, dst_stride, clip_width, clip_height); return; } #endif } } } // HAS_SCALEPLANEVERTICAL if (dx == 0x10000 && (x & 0xffff) == 0) { // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, y, dy, 4, filtering); return; } if (filtering && (dst_width + 1) / 2 == src_width) { ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); return; } if ((clip_height + 1) / 2 == src_height && (clip_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst); return; } #if HAS_SCALEUVBILINEARUP if (filtering && dy < 65536) { ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } #endif #if HAS_SCALEUVBILINEARDOWN if (filtering) { ScaleUVBilinearDown(src_width, src_height, clip_width, clip_height, src_stride, dst_stride, src, dst, x, dx, y, dy, filtering); return; } #endif ScaleUVSimple(src_width, src_height, clip_width, clip_height, src_stride, 
dst_stride, src, dst, x, dx, y, dy); } // Scale an UV image. LIBYUV_API int UVScale(const uint8_t* src_uv, int src_stride_uv, int src_width, int src_height, uint8_t* dst_uv, int dst_stride_uv, int dst_width, int dst_height, enum FilterMode filtering) { if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { return -1; } ScaleUV(src_uv, src_stride_uv, src_width, src_height, dst_uv, dst_stride_uv, dst_width, dst_height, 0, 0, dst_width, dst_height, filtering); return 0; } // Scale a 16 bit UV image. // This function is currently incomplete, it can't handle all cases. LIBYUV_API int UVScale_16(const uint16_t* src_uv, int src_stride_uv, int src_width, int src_height, uint16_t* dst_uv, int dst_stride_uv, int dst_width, int dst_height, enum FilterMode filtering) { int dy = 0; if (!src_uv || src_width <= 0 || src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) { return -1; } // UV does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. 
if (src_height < 0) { src_height = -src_height; src_uv = src_uv + (src_height - 1) * (int64_t)src_stride_uv; src_stride_uv = -src_stride_uv; } src_width = Abs(src_width); #ifdef HAS_UVCOPY if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) { if (dst_height == 1) { UVCopy_16(src_uv + ((src_height - 1) / 2) * (int64_t)src_stride_uv, src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); } else { dy = src_height / dst_height; UVCopy_16(src_uv + ((dy - 1) / 2) * (int64_t)src_stride_uv, dy * (int64_t)src_stride_uv, dst_uv, dst_stride_uv, dst_width, dst_height); } return 0; } #endif if (filtering && (dst_width + 1) / 2 == src_width) { ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height, src_stride_uv, dst_stride_uv, src_uv, dst_uv); return 0; } if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width && (filtering == kFilterBilinear || filtering == kFilterBox)) { ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height, src_stride_uv, dst_stride_uv, src_uv, dst_uv); return 0; } return -1; } #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif libyuv-0.0~git20220104.b91df1a/source/scale_win.cc000066400000000000000000001277611416500237200213170ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "libyuv/row.h" #include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ !defined(__clang__) && defined(_M_IX86) // Offsets for source bytes 0 to 9 static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                              10, 11, 12, 13, 13, 14, 14, 15};

// Per-byte weights applied with pmaddubsw to the byte pairs produced by the
// kShuf01/kShuf11/kShuf21 shuffles in the 3/4 box filters. Each pair of
// weights sums to 4.

// Coefficients for source bytes 0 to 10
static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding term (2) added to each 16-bit weighted sum before the >> 2 in the
// 3/4 box filters.
static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

// Shuffles for the 3/8 point sampler: kShuf38a places 6 selected bytes in
// output bytes 0..5, kShuf38b places 6 selected bytes in output bytes 6..11.
// Index 128 yields a zero byte.
static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                               128, 128, 128, 128, 128, 128, 128, 128};

static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                               6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                              128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                               6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
// (16.16 reciprocals used with pmulhuw to divide by 9 or 6.)
static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                                  65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                               11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                               12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                               13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
// (16.16 reciprocals used with pmulhuw to divide by 3 or 2.)
static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                                 65536 / 3, 65536 / 2, 0,         0};

// Reads 32 pixels, throws half away and writes 16 pixels.
// Point-samples one row to half width by keeping the high byte of every
// 16-bit pair. Arguments are loaded from the stack; src_stride is not read.
// The loop is bottom-tested: it always runs once and consumes dst_width in
// steps of 16.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8  // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Each output byte is the rounded average of two horizontally adjacent source
// bytes: pmaddubsw with a vector of 1s sums each pair, pavgw against zero
// computes (sum + 1) / 2. src_stride is not read.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Each output byte is the rounded average of a 2x2 block taken from the row
// at src_ptr and the row at src_ptr + src_stride. The four bytes are summed,
// then (sum + 2) / 4 is computed as a shift right by 1 followed by pavgw
// against zero. esi is saved and restored around the body.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
// vpackuswb packs within each 128-bit lane, so vpermq 0xd8 is used afterwards
// to restore linear byte order. src_stride is not read.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax, [eax + 64]
    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
// Each output byte is the rounded average of two horizontally adjacent source
// bytes, 32 outputs per iteration. src_stride is not read.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8_t* dst_ptr,
                                                int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax, [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
// The second source row is read at src_ptr + src_stride. esi is saved and
// restored around the body.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]  // src_ptr
    mov         esi, [esp + 4 + 8]  // src_stride
    mov         edx, [esp + 4 + 12]  // dst_ptr
    mov         ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    lea         eax, [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add
    vpaddw      ymm1, ymm1, ymm3
    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw      ymm1, ymm1, 1
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
// Keeps byte 2 of every 4 source bytes (mask 0x00ff0000 per dword), then
// packs twice to gather the kept bytes into 8 output bytes per iteration.
// src_stride is not read.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8_t* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Four rows (src_ptr + 0, 1, 2 and 3 * src_stride) are summed; each output
// byte is (sum of a 4x4 block + 8) >> 4. esi and edi are saved and restored
// around the body.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_ptr
    mov        esi, [esp + 8 + 8]  // src_stride
    mov        edx, [esp + 8 + 12]  // dst_ptr
    mov        ecx, [esp + 8 + 16]  // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3  // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]  // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // + 8 for round
    psrlw      xmm0, 4  // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
// Keeps byte 2 of every 4 source bytes and writes 16 bytes per iteration
// (the final store uses the low xmm half). vpermq 0xd8 follows each
// vpackuswb to undo its per-lane ordering. src_stride is not read.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8_t* dst_ptr,
                                          int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax, [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
// Four rows (src_ptr + 0, 1, 2 and 3 * src_stride) are summed; each output
// byte is (sum of a 4x4 block + 8) >> 4. esi and edi are saved and restored
// around the body.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8_t* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]  // src_ptr
    mov         esi, [esp + 8 + 8]  // src_stride
    mov         edx, [esp + 8 + 12]  // dst_ptr
    mov         ecx, [esp + 8 + 16]  // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw      ymm4, ymm4, 15
    vpsllw      ymm5, ymm4, 3  // constant 0x0008
    vpackuswb   ymm4, ymm4, ymm4

  wloop:
    vmovdqu     ymm0, [eax]  // average rows
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 2
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + edi]
    vmovdqu     ymm3, [eax + edi + 32]
    lea         eax, [eax + 64]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 3
    vpaddw      ymm1, ymm1, ymm3
    vphaddw     ymm0, ymm0, ymm1  // mutates
    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw      ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop         edi
    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// The middle 8 outputs come from source bytes 8..23, assembled with palignr
// from the two 16-byte loads. src_stride is not read.
__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// The two source rows (src_ptr and src_ptr + src_stride) are averaged with a
// single pavgb, i.e. equal weight. The result is then filtered horizontally
// in three groups of 8 outputs: shuffle into byte pairs, weight with the
// kMadd tables, add kRound34 and shift right by 2. esi is saved and restored.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    // kMadd21 is reloaded each iteration; xmm0..xmm7 are all in use.
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
// Same horizontal filter as ScaleRowDown34_1_Box_SSSE3, but the rows are
// combined with two chained pavgb operations (row1 = avg(row1, row0), then
// row0 = avg(row0, row1)), which weights the row at src_ptr more heavily
// than the row at src_ptr + src_stride. esi is saved and restored.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
// kShuf38a gathers 6 bytes from the first 16 source bytes into output bytes
// 0..5; kShuf38b gathers 6 bytes from the next 16 into bytes 6..11. The two
// results occupy disjoint bytes, so paddusb merges them. src_stride is not
// read.
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
// Three rows (src_ptr + 0, 1 and 2 * src_stride) are widened to 16 bits and
// summed, neighbouring columns are then added, and pmulhuw by kScaleAc33
// divides each box sum by 9 or 6. Each iteration stores 6 bytes using two
// overlapping 4-byte writes. esi is saved and restored.
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6  // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
// The two rows (src_ptr and src_ptr + src_stride) are averaged with pavgb.
// The kShufAb0/1/2 shuffles then spread the first, second and third byte of
// each box into 16-bit lanes, which are summed and divided by 3 or 2 via
// pmulhuw by kScaleAb2. esi is saved and restored.
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8_t* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]  // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1  // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
// dst_ptr is read-modify-written: each source byte is zero-extended and added
// with unsigned saturation (paddusw) to the 16-bit value already stored.
__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    mov        edx, [esp + 8]  // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

        // sum rows
  xloop:
    movdqu     xmm3, [eax]  // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]  // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2  // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0  // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
// vpermq 0xd8 pre-arranges the source so that the per-lane vpunpck
// instructions produce the words in linear order.
__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
                                        uint16_t* dst_ptr,
                                        int src_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    mov         edx, [esp + 8]  // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

        // sum rows
  xloop:
    vmovdqu     ymm3, [eax]  // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0  // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                               0x4040, 0x4040, 0x4040, 0x4040};

// Bilinear column filtering. SSSE3 version.
// x and dx are fixed point: the integer source index is taken from the upper
// 16 bits (pextrw word 1 / word 3) and the fraction is reduced to 7 bits
// (psrlw 9). Three registers are pushed, hence the "12 +" in the argument
// offsets. Two pixels are produced per pass of xloop2; xloop29 handles a
// single trailing pixel.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
                                             const uint8_t* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29  // fewer than 2 pixels requested: skip the pair loop

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1  // undo the -2 bias; negative means no pixel is left
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// The x and dx arguments are never loaded by this routine.
__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0  // interleave each byte with itself
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
// shufps with 0xdd selects dwords 1 and 3 of each source register, the same
// selector the Linear variant below labels "odd pixels".
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst_argb,
                                              int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Averages each even pixel with the following odd pixel using pavgb.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Averages two rows (src_argb and src_argb + src_stride) with pavgb, then
// averages each even/odd pixel pair, producing 4 output pixels (16 bytes)
// per pass from 8 input pixels. esi is pushed, so arguments start at
// esp + 4 + 4.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
// Gathers one 4-byte pixel every src_stepx pixels (ebx = src_stepx * 4 bytes,
// edi = 3 * ebx) and packs four of them into one 16-byte store.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8_t* dst_argb,
                                                 int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]  // step in bytes
    lea edi, [ebx + ebx * 2]  // 3 * step in bytes

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Reads pixel pairs (8 bytes) at a stride of src_stepx pixels from two rows
// (esi is turned into a pointer to the second row), averages the rows and
// then each even/odd pixel pair, and stores 4 pixels per pass.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8_t* dst_argb,
                                                    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]  // step in bytes
    lea edi, [ebx + ebx * 2]  // 3 * step in bytes

  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
// Keeps four fixed-point positions (x, x + dx, x + 2dx, x + 3dx) in xmm2 and
// advances them by 4 * dx per pass; the integer pixel index is the upper 16
// bits of each position (pextrw words 1, 3, 5, 7). Returns immediately when
// dst_width <= 0. Tails of 2 and 1 pixels are handled after the 4-pixel loop.
__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
                                          const uint8_t* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2  // bit 1 of the biased count: two pixels remain
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1  // bit 0 of the biased count: one pixel remains
    je xloop99

    // 1 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
  xloop99:

    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static const uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static const uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// ARGB bilinear column filter. For each output pixel it loads two adjacent
// source pixels (8 bytes) at the integer part of x and blends them with a
// 7-bit fraction via pmaddubsw. Two output pixels per pass of xloop2; a
// single trailing pixel is handled after xloop29.
__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
                                                 const uint8_t* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// The x and dx arguments are never loaded by this routine.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
                                             const uint8_t* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0  // interleave each 32-bit pixel with itself
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div with a 64-bit dividend in edx:eax and a signed
// 32-bit divide; the quotient is returned in eax.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
// Variant that computes ((num << 16) - 0x00010001) / (div - 1): the constant
// is subtracted from the 64-bit dividend (sub/sbb) and the divisor is
// decremented before the signed divide.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/source/test.sh000077500000000000000000000017341416500237200203510ustar00rootroot00000000000000#!/bin/bash
set -x

# Records a perf profile of one libyuv_test case (gunit filter "*<name>") at
# 1280x720 with 1000 repeats, then prints the report lines mentioning AVX.
function runbenchmark1 {
  perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
  perf report | grep AVX
}

runbenchmark1 ABGRToI420
runbenchmark1 Android420ToI420
runbenchmark1 ARGBToI420
runbenchmark1 Convert16To8Plane
runbenchmark1 ConvertToARGB
runbenchmark1 ConvertToI420
runbenchmark1 CopyPlane
runbenchmark1 H010ToAB30
runbenchmark1 H010ToAR30
runbenchmark1 HalfFloatPlane
runbenchmark1 I010ToAB30
runbenchmark1 I010ToAR30
runbenchmark1 I420Copy
runbenchmark1 I420Psnr
runbenchmark1 I420Scale
runbenchmark1 I420Ssim
runbenchmark1 I420ToARGB
runbenchmark1 I420ToNV12
runbenchmark1 I420ToUYVY
runbenchmark1 I422ToI420
runbenchmark1 InitCpuFlags
runbenchmark1 J420ToARGB
runbenchmark1 NV12ToARGB
runbenchmark1 NV12ToI420
runbenchmark1 NV12ToI420Rotate
runbenchmark1 SetCpuFlags
runbenchmark1 YUY2ToI420
libyuv-0.0~git20220104.b91df1a/source/video_common.cc000066400000000000000000000036461416500237200220240ustar00rootroot00000000000000/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/video_common.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// One alias -> canonical FOURCC mapping.
struct FourCCAliasEntry {
  uint32_t alias;
  uint32_t canonical;
};

// Must equal the number of initializers in kFourCCAliases below.
#define NUM_ALIASES 18
static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
    {FOURCC_IYUV, FOURCC_I420},
    {FOURCC_YU12, FOURCC_I420},
    {FOURCC_YU16, FOURCC_I422},
    {FOURCC_YU24, FOURCC_I444},
    {FOURCC_YUYV, FOURCC_YUY2},
    {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
    {FOURCC_HDYC, FOURCC_UYVY},
    {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
    {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
    {FOURCC_DMB1, FOURCC_MJPG},
    {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
    {FOURCC_RGB3, FOURCC_RAW},
    {FOURCC_BGR3, FOURCC_24BG},
    {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
    {FOURCC_CM24, FOURCC_RAW},  // kCMPixelFormat_24RGB
    {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
    {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
    {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
};
// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
//  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA

// Returns the canonical FOURCC for |fourcc|: the table is scanned linearly
// and the first matching alias wins; values not in the table are returned
// unchanged.
LIBYUV_API
uint32_t CanonicalFourCC(uint32_t fourcc) {
  int i;
  for (i = 0; i < NUM_ALIASES; ++i) {
    if (kFourCCAliases[i].alias == fourcc) {
      return kFourCCAliases[i].canonical;
    }
  }
  // Not an alias, so return it as-is.
  return fourcc;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
libyuv-0.0~git20220104.b91df1a/tools_libyuv/000077500000000000000000000000001416500237200202605ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/tools_libyuv/OWNERS000066400000000000000000000001001416500237200212070ustar00rootroot00000000000000mbonadei@chromium.org
fbarchard@chromium.org
pbos@chromium.org
libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/000077500000000000000000000000001416500237200224505ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/roll_deps.py000077500000000000000000000437351416500237200250240ustar00rootroot00000000000000#!/usr/bin/env python
# Copyright 2017 The LibYuv Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.

# This is a modified copy of the script in
# https://webrtc.googlesource.com/src/+/master/tools_webrtc/autoroller/roll_deps.py
# customized for libyuv.

"""Script to automatically roll dependencies in the libyuv DEPS file."""

import argparse
import base64
import collections
import logging
import os
import re
import subprocess
import sys
import urllib2


# Skip these dependencies (list without solution name prefix).
DONT_AUTOROLL_THESE = [ 'src/third_party/gflags/src', ] LIBYUV_URL = 'https://chromium.googlesource.com/libyuv/libyuv' CHROMIUM_SRC_URL = 'https://chromium.googlesource.com/chromium/src' CHROMIUM_COMMIT_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s' CHROMIUM_LOG_TEMPLATE = CHROMIUM_SRC_URL + '/+log/%s' CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z-]+)\'$') ROLL_BRANCH_NAME = 'roll_chromium_revision' SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) CHECKOUT_SRC_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir)) CHECKOUT_ROOT_DIR = os.path.realpath(os.path.join(CHECKOUT_SRC_DIR, os.pardir)) sys.path.append(os.path.join(CHECKOUT_SRC_DIR, 'build')) import find_depot_tools # pylint: disable=wrong-import-position find_depot_tools.add_depot_tools_to_path() CLANG_UPDATE_SCRIPT_URL_PATH = 'tools/clang/scripts/update.py' CLANG_UPDATE_SCRIPT_LOCAL_PATH = os.path.join(CHECKOUT_SRC_DIR, 'tools', 'clang', 'scripts', 'update.py') DepsEntry = collections.namedtuple('DepsEntry', 'path url revision') ChangedDep = collections.namedtuple('ChangedDep', 'path url current_rev new_rev') class RollError(Exception): pass def VarLookup(local_scope): return lambda var_name: local_scope['vars'][var_name] def ParseDepsDict(deps_content): local_scope = {} global_scope = { 'Var': VarLookup(local_scope), 'Str': lambda s: s, 'deps_os': {}, } exec(deps_content, global_scope, local_scope) return local_scope def ParseLocalDepsFile(filename): with open(filename, 'rb') as f: deps_content = f.read() return ParseDepsDict(deps_content) def ParseRemoteCrDepsFile(revision): deps_content = ReadRemoteCrFile('DEPS', revision) return ParseDepsDict(deps_content) def ParseCommitPosition(commit_message): for line in reversed(commit_message.splitlines()): m = COMMIT_POSITION_RE.match(line.strip()) if m: return int(m.group(1)) logging.error('Failed to parse 
commit position id from:\n%s\n', commit_message) sys.exit(-1) def _RunCommand(command, working_dir=None, ignore_exit_code=False, extra_env=None): """Runs a command and returns the output from that command. If the command fails (exit code != 0), the function will exit the process. Returns: A tuple containing the stdout and stderr outputs as strings. """ working_dir = working_dir or CHECKOUT_SRC_DIR logging.debug('CMD: %s CWD: %s', ' '.join(command), working_dir) env = os.environ.copy() if extra_env: assert all(isinstance(value, str) for value in extra_env.values()) logging.debug('extra env: %s', extra_env) env.update(extra_env) p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env, cwd=working_dir, universal_newlines=True) std_output = p.stdout.read() err_output = p.stderr.read() p.wait() p.stdout.close() p.stderr.close() if not ignore_exit_code and p.returncode != 0: logging.error('Command failed: %s\n' 'stdout:\n%s\n' 'stderr:\n%s\n', ' '.join(command), std_output, err_output) sys.exit(p.returncode) return std_output, err_output def _GetBranches(): """Returns a tuple of active,branches. The 'active' is the name of the currently active branch and 'branches' is a list of all branches. """ lines = _RunCommand(['git', 'branch'])[0].split('\n') branches = [] active = '' for line in lines: if '*' in line: # The assumption is that the first char will always be the '*'. active = line[1:].strip() branches.append(active) else: branch = line.strip() if branch: branches.append(branch) return active, branches def _ReadGitilesContent(url): # Download and decode BASE64 content until # https://code.google.com/p/gitiles/issues/detail?id=7 is fixed. base64_content = ReadUrlContent(url + '?format=TEXT') return base64.b64decode(base64_content[0]) def ReadRemoteCrFile(path_below_src, revision): """Reads a remote Chromium file of a specific revision. 
Returns a string.""" return _ReadGitilesContent(CHROMIUM_FILE_TEMPLATE % (revision, path_below_src)) def ReadRemoteCrCommit(revision): """Reads a remote Chromium commit message. Returns a string.""" return _ReadGitilesContent(CHROMIUM_COMMIT_TEMPLATE % revision) def ReadUrlContent(url): """Connect to a remote host and read the contents. Returns a list of lines.""" conn = urllib2.urlopen(url) try: return conn.readlines() except IOError as e: logging.exception('Error connecting to %s. Error: %s', url, e) raise finally: conn.close() def GetMatchingDepsEntries(depsentry_dict, dir_path): """Gets all deps entries matching the provided path. This list may contain more than one DepsEntry object. Example: dir_path='src/testing' would give results containing both 'src/testing/gtest' and 'src/testing/gmock' deps entries for Chromium's DEPS. Example 2: dir_path='src/build' should return 'src/build' but not 'src/buildtools'. Returns: A list of DepsEntry objects. """ result = [] for path, depsentry in depsentry_dict.iteritems(): if path == dir_path: result.append(depsentry) else: parts = path.split('/') if all(part == parts[i] for i, part in enumerate(dir_path.split('/'))): result.append(depsentry) return result def BuildDepsentryDict(deps_dict): """Builds a dict of paths to DepsEntry objects from a raw parsed deps dict.""" result = {} def AddDepsEntries(deps_subdict): for path, deps_url_spec in deps_subdict.iteritems(): # The deps url is either an URL and a condition, or just the URL. 
if isinstance(deps_url_spec, dict): if deps_url_spec.get('dep_type') == 'cipd': continue deps_url = deps_url_spec['url'] else: deps_url = deps_url_spec if not result.has_key(path): url, revision = deps_url.split('@') if deps_url else (None, None) result[path] = DepsEntry(path, url, revision) AddDepsEntries(deps_dict['deps']) for deps_os in ['win', 'mac', 'unix', 'android', 'ios', 'unix']: AddDepsEntries(deps_dict.get('deps_os', {}).get(deps_os, {})) return result def CalculateChangedDeps(libyuv_deps, new_cr_deps): """ Calculate changed deps entries based on entries defined in the libyuv DEPS file: - If a shared dependency with the Chromium DEPS file: roll it to the same revision as Chromium (i.e. entry in the new_cr_deps dict) - If it's a Chromium sub-directory, roll it to the HEAD revision (notice this means it may be ahead of the chromium_revision, but generally these should be close). - If it's another DEPS entry (not shared with Chromium), roll it to HEAD unless it's configured to be skipped. Returns: A list of ChangedDep objects representing the changed deps. """ result = [] libyuv_entries = BuildDepsentryDict(libyuv_deps) new_cr_entries = BuildDepsentryDict(new_cr_deps) for path, libyuv_deps_entry in libyuv_entries.iteritems(): if path in DONT_AUTOROLL_THESE: continue cr_deps_entry = new_cr_entries.get(path) if cr_deps_entry: # Use the revision from Chromium's DEPS file. new_rev = cr_deps_entry.revision assert libyuv_deps_entry.url == cr_deps_entry.url, ( 'Libyuv DEPS entry %s has a different URL (%s) than Chromium (%s).' % (path, libyuv_deps_entry.url, cr_deps_entry.url)) else: # Use the HEAD of the deps repo. stdout, _ = _RunCommand(['git', 'ls-remote', libyuv_deps_entry.url, 'HEAD']) new_rev = stdout.strip().split('\t')[0] # Check if an update is necessary. 
if libyuv_deps_entry.revision != new_rev: logging.debug('Roll dependency %s to %s', path, new_rev) result.append(ChangedDep(path, libyuv_deps_entry.url, libyuv_deps_entry.revision, new_rev)) return sorted(result) def CalculateChangedClang(new_cr_rev): def GetClangRev(lines): for line in lines: match = CLANG_REVISION_RE.match(line) if match: return match.group(1) raise RollError('Could not parse Clang revision from:\n' + '\n'.join(' ' + l for l in lines)) with open(CLANG_UPDATE_SCRIPT_LOCAL_PATH, 'rb') as f: current_lines = f.readlines() current_rev = GetClangRev(current_lines) new_clang_update_py = ReadRemoteCrFile(CLANG_UPDATE_SCRIPT_URL_PATH, new_cr_rev).splitlines() new_rev = GetClangRev(new_clang_update_py) return ChangedDep(CLANG_UPDATE_SCRIPT_LOCAL_PATH, None, current_rev, new_rev) def GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, new_commit_pos, changed_deps_list, clang_change): current_cr_rev = current_cr_rev[0:10] new_cr_rev = new_cr_rev[0:10] rev_interval = '%s..%s' % (current_cr_rev, new_cr_rev) git_number_interval = '%s:%s' % (current_commit_pos, new_commit_pos) commit_msg = ['Roll chromium_revision %s (%s)\n' % (rev_interval, git_number_interval)] commit_msg.append('Change log: %s' % (CHROMIUM_LOG_TEMPLATE % rev_interval)) commit_msg.append('Full diff: %s\n' % (CHROMIUM_COMMIT_TEMPLATE % rev_interval)) if changed_deps_list: commit_msg.append('Changed dependencies:') for c in changed_deps_list: commit_msg.append('* %s: %s/+log/%s..%s' % (c.path, c.url, c.current_rev[0:10], c.new_rev[0:10])) change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, 'DEPS') commit_msg.append('DEPS diff: %s\n' % change_url) else: commit_msg.append('No dependencies changed.') if clang_change.current_rev != clang_change.new_rev: commit_msg.append('Clang version changed %s:%s' % (clang_change.current_rev, clang_change.new_rev)) change_url = CHROMIUM_FILE_TEMPLATE % (rev_interval, CLANG_UPDATE_SCRIPT_URL_PATH) commit_msg.append('Details: %s\n' % change_url) 
else: commit_msg.append('No update to Clang.\n') # TBR needs to be non-empty for Gerrit to process it. git_author = _RunCommand(['git', 'config', 'user.email'], working_dir=CHECKOUT_SRC_DIR)[0].strip() commit_msg.append('TBR=%s' % git_author) commit_msg.append('BUG=None') return '\n'.join(commit_msg) def UpdateDepsFile(deps_filename, old_cr_revision, new_cr_revision, changed_deps): """Update the DEPS file with the new revision.""" # Update the chromium_revision variable. with open(deps_filename, 'rb') as deps_file: deps_content = deps_file.read() deps_content = deps_content.replace(old_cr_revision, new_cr_revision) with open(deps_filename, 'wb') as deps_file: deps_file.write(deps_content) # Update each individual DEPS entry. for dep in changed_deps: local_dep_dir = os.path.join(CHECKOUT_ROOT_DIR, dep.path) if not os.path.isdir(local_dep_dir): raise RollError( 'Cannot find local directory %s. Make sure the .gclient file\n' 'contains all platforms in the target_os list, i.e.\n' 'target_os = ["android", "unix", "mac", "ios", "win"];\n' 'Then run "gclient sync" again.' 
% local_dep_dir) _RunCommand( ['gclient', 'setdep', '--revision', '%s@%s' % (dep.path, dep.new_rev)], working_dir=CHECKOUT_SRC_DIR) def _IsTreeClean(): stdout, _ = _RunCommand(['git', 'status', '--porcelain']) if len(stdout) == 0: return True logging.error('Dirty/unversioned files:\n%s', stdout) return False def _EnsureUpdatedMasterBranch(dry_run): current_branch = _RunCommand( ['git', 'rev-parse', '--abbrev-ref', 'HEAD'])[0].splitlines()[0] if current_branch != 'main': logging.error('Please checkout the main branch and re-run this script.') if not dry_run: sys.exit(-1) logging.info('Updating main branch...') _RunCommand(['git', 'pull']) def _CreateRollBranch(dry_run): logging.info('Creating roll branch: %s', ROLL_BRANCH_NAME) if not dry_run: _RunCommand(['git', 'checkout', '-b', ROLL_BRANCH_NAME]) def _RemovePreviousRollBranch(dry_run): active_branch, branches = _GetBranches() if active_branch == ROLL_BRANCH_NAME: active_branch = 'main' if ROLL_BRANCH_NAME in branches: logging.info('Removing previous roll branch (%s)', ROLL_BRANCH_NAME) if not dry_run: _RunCommand(['git', 'checkout', active_branch]) _RunCommand(['git', 'branch', '-D', ROLL_BRANCH_NAME]) def _LocalCommit(commit_msg, dry_run): logging.info('Committing changes locally.') if not dry_run: _RunCommand(['git', 'add', '--update', '.']) _RunCommand(['git', 'commit', '-m', commit_msg]) def ChooseCQMode(skip_cq, cq_over, current_commit_pos, new_commit_pos): if skip_cq: return 0 if (new_commit_pos - current_commit_pos) < cq_over: return 1 return 2 def _UploadCL(commit_queue_mode): """Upload the committed changes as a changelist to Gerrit. commit_queue_mode: - 2: Submit to commit queue. - 1: Run trybots but do not submit to CQ. - 0: Skip CQ, upload only. 
""" cmd = ['git', 'cl', 'upload', '--force', '--bypass-hooks', '--send-mail'] if commit_queue_mode >= 2: logging.info('Sending the CL to the CQ...') cmd.extend(['-o', 'label=Bot-Commit+1']) cmd.extend(['-o', 'label=Commit-Queue+2']) elif commit_queue_mode >= 1: logging.info('Starting CQ dry run...') cmd.extend(['-o', 'label=Commit-Queue+1']) extra_env = { 'EDITOR': 'true', 'SKIP_GCE_AUTH_FOR_GIT': '1', } stdout, stderr = _RunCommand(cmd, extra_env=extra_env) logging.debug('Output from "git cl upload":\nstdout:\n%s\n\nstderr:\n%s', stdout, stderr) def main(): p = argparse.ArgumentParser() p.add_argument('--clean', action='store_true', default=False, help='Removes any previous local roll branch.') p.add_argument('-r', '--revision', help=('Chromium Git revision to roll to. Defaults to the ' 'Chromium HEAD revision if omitted.')) p.add_argument('--dry-run', action='store_true', default=False, help=('Calculate changes and modify DEPS, but don\'t create ' 'any local branch, commit, upload CL or send any ' 'tryjobs.')) p.add_argument('-i', '--ignore-unclean-workdir', action='store_true', default=False, help=('Ignore if the current branch is not main or if there ' 'are uncommitted changes (default: %(default)s).')) grp = p.add_mutually_exclusive_group() grp.add_argument('--skip-cq', action='store_true', default=False, help='Skip sending the CL to the CQ (default: %(default)s)') grp.add_argument('--cq-over', type=int, default=1, help=('Commit queue dry run if the revision difference ' 'is below this number (default: %(default)s)')) p.add_argument('-v', '--verbose', action='store_true', default=False, help='Be extra verbose in printing of log messages.') opts = p.parse_args() if opts.verbose: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if not opts.ignore_unclean_workdir and not _IsTreeClean(): logging.error('Please clean your local checkout first.') return 1 if opts.clean: _RemovePreviousRollBranch(opts.dry_run) if not 
opts.ignore_unclean_workdir: _EnsureUpdatedMasterBranch(opts.dry_run) new_cr_rev = opts.revision if not new_cr_rev: stdout, _ = _RunCommand(['git', 'ls-remote', CHROMIUM_SRC_URL, 'HEAD']) head_rev = stdout.strip().split('\t')[0] logging.info('No revision specified. Using HEAD: %s', head_rev) new_cr_rev = head_rev deps_filename = os.path.join(CHECKOUT_SRC_DIR, 'DEPS') libyuv_deps = ParseLocalDepsFile(deps_filename) current_cr_rev = libyuv_deps['vars']['chromium_revision'] current_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(current_cr_rev)) new_commit_pos = ParseCommitPosition(ReadRemoteCrCommit(new_cr_rev)) new_cr_deps = ParseRemoteCrDepsFile(new_cr_rev) changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) clang_change = CalculateChangedClang(new_cr_rev) commit_msg = GenerateCommitMessage(current_cr_rev, new_cr_rev, current_commit_pos, new_commit_pos, changed_deps, clang_change) logging.debug('Commit message:\n%s', commit_msg) _CreateRollBranch(opts.dry_run) UpdateDepsFile(deps_filename, current_cr_rev, new_cr_rev, changed_deps) _LocalCommit(commit_msg, opts.dry_run) commit_queue_mode = ChooseCQMode(opts.skip_cq, opts.cq_over, current_commit_pos, new_commit_pos) logging.info('Uploading CL...') if not opts.dry_run: _UploadCL(commit_queue_mode) return 0 if __name__ == '__main__': sys.exit(main()) libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/unittests/000077500000000000000000000000001416500237200245125ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/unittests/roll_deps_test.py000077500000000000000000000126771416500237200301260ustar00rootroot00000000000000#!/usr/bin/env python # Copyright 2017 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. 
All contributing project authors may # be found in the AUTHORS file in the root of the source tree. import glob import os import shutil import sys import tempfile import unittest SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) PARENT_DIR = os.path.join(SCRIPT_DIR, os.pardir) sys.path.append(PARENT_DIR) import roll_deps # pylint: disable=wrong-import-position from roll_deps import CalculateChangedDeps, GetMatchingDepsEntries, \ ParseDepsDict, ParseLocalDepsFile, \ UpdateDepsFile # pylint: disable=wrong-import-position TEST_DATA_VARS = { 'chromium_git': 'https://chromium.googlesource.com', 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d', } DEPS_ENTRIES = { 'src/build': 'https://build.com', 'src/buildtools': 'https://buildtools.com', 'src/testing/gtest': 'https://gtest.com', 'src/testing/gmock': 'https://gmock.com', } BUILD_OLD_REV = '52f7afeca991d96d68cf0507e20dbdd5b845691f' BUILD_NEW_REV = 'HEAD' BUILDTOOLS_OLD_REV = '64e38f0cebdde27aa0cfb405f330063582f9ac76' BUILDTOOLS_NEW_REV = '55ad626b08ef971fd82a62b7abb325359542952b' class TestError(Exception): pass class FakeCmd(object): def __init__(self): self.expectations = [] def add_expectation(self, *args, **kwargs): returns = kwargs.pop('_returns', None) self.expectations.append((args, kwargs, returns)) def __call__(self, *args, **kwargs): if not self.expectations: raise TestError('Got unexpected\n%s\n%s' % (args, kwargs)) exp_args, exp_kwargs, exp_returns = self.expectations.pop(0) if args != exp_args or kwargs != exp_kwargs: message = 'Expected:\n args: %s\n kwargs: %s\n' % (exp_args, exp_kwargs) message += 'Got:\n args: %s\n kwargs: %s\n' % (args, kwargs) raise TestError(message) return exp_returns class TestRollChromiumRevision(unittest.TestCase): def setUp(self): self._output_dir = tempfile.mkdtemp() for test_file in glob.glob(os.path.join(SCRIPT_DIR, 'testdata', '*')): shutil.copy(test_file, self._output_dir) self._libyuv_depsfile = os.path.join(self._output_dir, 'DEPS') 
self._old_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.old') self._new_cr_depsfile = os.path.join(self._output_dir, 'DEPS.chromium.new') self.fake = FakeCmd() self.old_RunCommand = getattr(roll_deps, '_RunCommand') setattr(roll_deps, '_RunCommand', self.fake) def tearDown(self): shutil.rmtree(self._output_dir, ignore_errors=True) self.assertEqual(self.fake.expectations, []) setattr(roll_deps, '_RunCommand', self.old_RunCommand) def testVarLookup(self): local_scope = {'foo': 'wrong', 'vars': {'foo': 'bar'}} lookup = roll_deps.VarLookup(local_scope) self.assertEquals(lookup('foo'), 'bar') def testUpdateDepsFile(self): new_rev = 'aaaaabbbbbcccccdddddeeeeefffff0000011111' current_rev = TEST_DATA_VARS['chromium_revision'] UpdateDepsFile(self._libyuv_depsfile, current_rev, new_rev, []) with open(self._libyuv_depsfile) as deps_file: deps_contents = deps_file.read() self.assertTrue(new_rev in deps_contents, 'Failed to find %s in\n%s' % (new_rev, deps_contents)) def testParseDepsDict(self): with open(self._libyuv_depsfile) as deps_file: deps_contents = deps_file.read() local_scope = ParseDepsDict(deps_contents) vars_dict = local_scope['vars'] def assertVar(variable_name): self.assertEquals(vars_dict[variable_name], TEST_DATA_VARS[variable_name]) assertVar('chromium_git') assertVar('chromium_revision') self.assertEquals(len(local_scope['deps']), 3) def testGetMatchingDepsEntriesReturnsPathInSimpleCase(self): entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing/gtest') self.assertEquals(len(entries), 1) self.assertEquals(entries[0], DEPS_ENTRIES['src/testing/gtest']) def testGetMatchingDepsEntriesHandlesSimilarStartingPaths(self): entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/testing') self.assertEquals(len(entries), 2) def testGetMatchingDepsEntriesHandlesTwoPathsWithIdenticalFirstParts(self): entries = GetMatchingDepsEntries(DEPS_ENTRIES, 'src/build') self.assertEquals(len(entries), 1) self.assertEquals(entries[0], DEPS_ENTRIES['src/build']) def 
testCalculateChangedDeps(self): _SetupGitLsRemoteCall(self.fake, 'https://chromium.googlesource.com/chromium/src/build', BUILD_NEW_REV) libyuv_deps = ParseLocalDepsFile(self._libyuv_depsfile) new_cr_deps = ParseLocalDepsFile(self._new_cr_depsfile) changed_deps = CalculateChangedDeps(libyuv_deps, new_cr_deps) self.assertEquals(len(changed_deps), 2) self.assertEquals(changed_deps[0].path, 'src/build') self.assertEquals(changed_deps[0].current_rev, BUILD_OLD_REV) self.assertEquals(changed_deps[0].new_rev, BUILD_NEW_REV) self.assertEquals(changed_deps[1].path, 'src/buildtools') self.assertEquals(changed_deps[1].current_rev, BUILDTOOLS_OLD_REV) self.assertEquals(changed_deps[1].new_rev, BUILDTOOLS_NEW_REV) def _SetupGitLsRemoteCall(cmd_fake, url, revision): cmd = ['git', 'ls-remote', url, revision] cmd_fake.add_expectation(cmd, _returns=(revision, None)) if __name__ == '__main__': unittest.main() libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/unittests/testdata/000077500000000000000000000000001416500237200263235ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/unittests/testdata/DEPS000066400000000000000000000014101416500237200267750ustar00rootroot00000000000000# DEPS file for unit tests. vars = { 'chromium_git': 'https://chromium.googlesource.com', 'chromium_revision': '1b9c098a08e40114e44b6c1ec33ddf95c40b901d', 'ignored_str': Str(''), } deps = { # Entry that is a directory in Chromium, so we're using a Git subtree mirror for it. 'src/build': Var('chromium_git') + '/chromium/src/build' + '@' + '52f7afeca991d96d68cf0507e20dbdd5b845691f', # Entry that's also a DEPS entry in the Chromium DEPS file. 'src/buildtools': Var('chromium_git') + '/chromium/buildtools.git' + '@' + '64e38f0cebdde27aa0cfb405f330063582f9ac76', # Entry only present in libyuv, not Chromium. 
'src/third_party/gflags/src': Var('chromium_git') + '/external/github.com/gflags/gflags@03bebcb065c83beff83d50ae025a55a4bf94dfca', } libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.new000066400000000000000000000005331416500237200314140ustar00rootroot00000000000000# DEPS file for unit tests. vars = { 'chromium_git': 'https://chromium.googlesource.com', # This is updated compared to the DEPS.chromium.old file. 'buildtools_revision': '55ad626b08ef971fd82a62b7abb325359542952b', } deps = { 'src/buildtools': Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'), } libyuv-0.0~git20220104.b91df1a/tools_libyuv/autoroller/unittests/testdata/DEPS.chromium.old000066400000000000000000000005331416500237200314010ustar00rootroot00000000000000# DEPS file for unit tests. vars = { 'chromium_git': 'https://chromium.googlesource.com', # This is and older revision than DEPS.chromium.new file. 'buildtools_revision': '64e38f0cebdde27aa0cfb405f330063582f9ac76', } deps = { 'src/buildtools': Var('chromium_git') + '/chromium/buildtools.git' + '@' + Var('buildtools_revision'), } libyuv-0.0~git20220104.b91df1a/tools_libyuv/get_landmines.py000077500000000000000000000023441416500237200234510ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright 2016 The LibYuv Project Authors. All rights reserved. # # Use of this source code is governed by a BSD-style license # that can be found in the LICENSE file in the root of the source # tree. An additional intellectual property rights grant can be found # in the file PATENTS. All contributing project authors may # be found in the AUTHORS file in the root of the source tree. """ This file emits the list of reasons why a particular build needs to be clobbered (or a list of 'landmines'). """ import sys def print_landmines(): """ ALL LANDMINES ARE EMITTED FROM HERE. """ # DO NOT add landmines as part of a regular CL. 
Landmines are a last-effort # bandaid fix if a CL that got landed has a build dependency bug and all bots # need to be cleaned up. If you're writing a new CL that causes build # dependency problems, fix the dependency problems instead of adding a # landmine. # See the Chromium version in src/build/get_landmines.py for usage examples. print('Clobber to remove GYP artifacts after switching bots to GN.') print('Another try to remove GYP artifacts after switching bots to GN.') def main(): print_landmines() return 0 if __name__ == '__main__': sys.exit(main()) libyuv-0.0~git20220104.b91df1a/tools_libyuv/msan/000077500000000000000000000000001416500237200212165ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/tools_libyuv/msan/OWNERS000066400000000000000000000000771416500237200221620ustar00rootroot00000000000000mbonadei@chromium.org fbarchard@chromium.org pbos@chromium.org libyuv-0.0~git20220104.b91df1a/tools_libyuv/msan/blacklist.txt000066400000000000000000000005741416500237200237350ustar00rootroot00000000000000# The rules in this file are only applied at compile time. # Because the Chrome buildsystem does not automatically touch the files # mentioned here, changing this file requires clobbering all MSan bots. # # Please think twice before you add or remove these rules. # This is a stripped down copy of Chromium's blacklist.txt, to enable # adding libyuv-specific blacklist entries. libyuv-0.0~git20220104.b91df1a/tools_libyuv/ubsan/000077500000000000000000000000001416500237200213705ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/tools_libyuv/ubsan/OWNERS000066400000000000000000000000771416500237200223340ustar00rootroot00000000000000mbonadei@chromium.org fbarchard@chromium.org pbos@chromium.org libyuv-0.0~git20220104.b91df1a/tools_libyuv/ubsan/blacklist.txt000066400000000000000000000010771416500237200241060ustar00rootroot00000000000000############################################################################# # UBSan blacklist. 
# Please think twice before you add or remove these rules. # This is a stripped down copy of Chromium's blacklist.txt, to enable # adding libyuv-specific blacklist entries. ############################################################################# # YASM does some funny things that UBsan doesn't like. # https://crbug.com/489901 src:*/third_party/yasm/* ############################################################################# # Ignore system libraries. src:*/usr/* libyuv-0.0~git20220104.b91df1a/tools_libyuv/ubsan/vptr_blacklist.txt000066400000000000000000000021161416500237200251540ustar00rootroot00000000000000############################################################################# # UBSan vptr blacklist. # Function and type based blacklisting use a mangled name, and it is especially # tricky to represent C++ types. For now, any possible changes by name manglings # are simply represented as wildcard expressions of regexp, and thus it might be # over-blacklisted. # # Please think twice before you add or remove these rules. # # This is a stripped down copy of Chromium's vptr_blacklist.txt, to enable # adding libyuv-specific blacklist entries. ############################################################################# # Using raw pointer values. # # A raw pointer value (16) is used to infer the field offset by # GOOGLE_PROTOBUF_GENERATED_MESSAGE_FIELD_OFFSET. # Example: # src:*/third_party/protobuf/src/google/protobuf/compiler/plugin.pb.cc ############################################################################# # UBsan goes into an infinite recursion when __dynamic_cast instrumented with # "vptr". See crbug.com/609786.
src:*/third_party/libc\+\+abi/trunk/src/private_typeinfo.cpp libyuv-0.0~git20220104.b91df1a/unit_test/000077500000000000000000000000001416500237200175445ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/unit_test/basictypes_test.cc000066400000000000000000000021521416500237200232600ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" namespace libyuv { TEST_F(LibYUVBaseTest, SizeOfTypes) { int8_t i8 = -1; uint8_t u8 = 1u; int16_t i16 = -1; uint16_t u16 = 1u; int32_t i32 = -1; uint32_t u32 = 1u; int64_t i64 = -1; uint64_t u64 = 1u; EXPECT_EQ(1u, sizeof(i8)); EXPECT_EQ(1u, sizeof(u8)); EXPECT_EQ(2u, sizeof(i16)); EXPECT_EQ(2u, sizeof(u16)); EXPECT_EQ(4u, sizeof(i32)); EXPECT_EQ(4u, sizeof(u32)); EXPECT_EQ(8u, sizeof(i64)); EXPECT_EQ(8u, sizeof(u64)); EXPECT_GT(0, i8); EXPECT_LT(0u, u8); EXPECT_GT(0, i16); EXPECT_LT(0u, u16); EXPECT_GT(0, i32); EXPECT_LT(0u, u32); EXPECT_GT(0, i64); EXPECT_LT(0u, u64); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/color_test.cc000066400000000000000000000740521416500237200222400ustar00rootroot00000000000000/* * Copyright 2015 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include #include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/convert.h" #include "libyuv/convert_argb.h" #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" namespace libyuv { // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB. // Port to Visual C and other CPUs #if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__)) #define ERROR_FULL 5 #define ERROR_J420 4 #else #define ERROR_FULL 6 #define ERROR_J420 6 #endif #define ERROR_R 1 #define ERROR_G 1 #ifdef LIBYUV_UNLIMITED_DATA #define ERROR_B 1 #else #define ERROR_B 18 #endif #define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \ TEST_F(LibYUVColorTest, TESTNAME) { \ const int kPixels = benchmark_width_ * benchmark_height_; \ const int kHalfPixels = \ ((benchmark_width_ + 1) / 2) * ((benchmark_height_ + HS1) / HS); \ align_buffer_page_end(orig_y, kPixels); \ align_buffer_page_end(orig_u, kHalfPixels); \ align_buffer_page_end(orig_v, kHalfPixels); \ align_buffer_page_end(orig_pixels, kPixels * 4); \ align_buffer_page_end(temp_y, kPixels); \ align_buffer_page_end(temp_u, kHalfPixels); \ align_buffer_page_end(temp_v, kHalfPixels); \ align_buffer_page_end(dst_pixels_opt, kPixels * 4); \ align_buffer_page_end(dst_pixels_c, kPixels * 4); \ \ MemRandomize(orig_pixels, kPixels * 4); \ MemRandomize(orig_y, kPixels); \ MemRandomize(orig_u, kHalfPixels); \ MemRandomize(orig_v, kHalfPixels); \ MemRandomize(temp_y, kPixels); \ MemRandomize(temp_u, kHalfPixels); \ MemRandomize(temp_v, kHalfPixels); \ MemRandomize(dst_pixels_opt, kPixels * 4); \ MemRandomize(dst_pixels_c, kPixels * 4); \ \ /* The test is overall for color conversion matrix being reversible, so */ \ /* this initializes the pixel with 2x2 blocks to eliminate subsampling. 
*/ \ uint8_t* p = orig_y; \ for (int y = 0; y < benchmark_height_ - HS1; y += HS) { \ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p[1] = r; \ p[HN] = r; \ p[HN + 1] = r; \ p += 2; \ } \ if (benchmark_width_ & 1) { \ uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p[HN] = r; \ p += 1; \ } \ p += HN; \ } \ if ((benchmark_height_ & 1) && HS == 2) { \ for (int x = 0; x < benchmark_width_ - 1; x += 2) { \ uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p[1] = r; \ p += 2; \ } \ if (benchmark_width_ & 1) { \ uint8_t r = static_cast(fastrand()); \ p[0] = r; \ p += 1; \ } \ } \ /* Start with YUV converted to ARGB. */ \ YUVTOARGB(orig_y, benchmark_width_, orig_u, (benchmark_width_ + 1) / 2, \ orig_v, (benchmark_width_ + 1) / 2, orig_pixels, \ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \ \ ARGBTOYUV(orig_pixels, benchmark_width_ * 4, temp_y, benchmark_width_, \ temp_u, (benchmark_width_ + 1) / 2, temp_v, \ (benchmark_width_ + 1) / 2, benchmark_width_, \ benchmark_height_); \ \ MaskCpuFlags(disable_cpu_flags_); \ YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \ temp_v, (benchmark_width_ + 1) / 2, dst_pixels_c, \ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \ MaskCpuFlags(benchmark_cpu_info_); \ \ for (int i = 0; i < benchmark_iterations_; ++i) { \ YUVTOARGB(temp_y, benchmark_width_, temp_u, (benchmark_width_ + 1) / 2, \ temp_v, (benchmark_width_ + 1) / 2, dst_pixels_opt, \ benchmark_width_ * 4, benchmark_width_, benchmark_height_); \ } \ /* Test C and SIMD match. */ \ for (int i = 0; i < kPixels * 4; ++i) { \ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ } \ /* Test SIMD is close to original. 
*/ \ for (int i = 0; i < kPixels * 4; ++i) { \ EXPECT_NEAR(static_cast(orig_pixels[i]), \ static_cast(dst_pixels_opt[i]), DIFF); \ } \ \ free_aligned_buffer_page_end(orig_pixels); \ free_aligned_buffer_page_end(orig_y); \ free_aligned_buffer_page_end(orig_u); \ free_aligned_buffer_page_end(orig_v); \ free_aligned_buffer_page_end(temp_y); \ free_aligned_buffer_page_end(temp_u); \ free_aligned_buffer_page_end(temp_v); \ free_aligned_buffer_page_end(dst_pixels_opt); \ free_aligned_buffer_page_end(dst_pixels_c); \ } TESTCS(TestI420, I420ToARGB, ARGBToI420, 1, 2, benchmark_width_, ERROR_FULL) TESTCS(TestI422, I422ToARGB, ARGBToI422, 0, 1, 0, ERROR_FULL) TESTCS(TestJ420, J420ToARGB, ARGBToJ420, 1, 2, benchmark_width_, ERROR_J420) TESTCS(TestJ422, J422ToARGB, ARGBToJ422, 0, 1, 0, ERROR_J420) static void YUVToRGB(int y, int u, int v, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_u[8]); SIMD_ALIGNED(uint8_t orig_v[8]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); memset(orig_u, u, kHalfPixels); memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. */ I422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } static void YUVJToRGB(int y, int u, int v, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_u[8]); SIMD_ALIGNED(uint8_t orig_v[8]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); memset(orig_u, u, kHalfPixels); memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. 
*/ J422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } static void YUVHToRGB(int y, int u, int v, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_u[8]); SIMD_ALIGNED(uint8_t orig_v[8]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); memset(orig_u, u, kHalfPixels); memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. */ H422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } #define F422ToARGB(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j) static void YUVFToRGB(int y, int u, int v, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_u[8]); SIMD_ALIGNED(uint8_t orig_v[8]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); memset(orig_u, u, kHalfPixels); memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. 
*/ F422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_u[8]); SIMD_ALIGNED(uint8_t orig_v[8]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); memset(orig_u, u, kHalfPixels); memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. */ U422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } #define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j) static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2); SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_u[8]); SIMD_ALIGNED(uint8_t orig_v[8]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); memset(orig_u, u, kHalfPixels); memset(orig_v, v, kHalfPixels); /* YUV converted to ARGB. */ V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } static void YToRGB(int y, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); /* YUV converted to ARGB. 
*/ I400ToARGB(orig_y, kWidth, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } static void YJToRGB(int y, int* r, int* g, int* b) { const int kWidth = 16; const int kHeight = 1; const int kPixels = kWidth * kHeight; SIMD_ALIGNED(uint8_t orig_y[16]); SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]); memset(orig_y, y, kPixels); /* YUV converted to ARGB. */ J400ToARGB(orig_y, kWidth, orig_pixels, kWidth * 4, kWidth, kHeight); *b = orig_pixels[0]; *g = orig_pixels[1]; *r = orig_pixels[2]; } // Pick a method for clamping. // #define CLAMPMETHOD_IF 1 // #define CLAMPMETHOD_TABLE 1 #define CLAMPMETHOD_TERNARY 1 // #define CLAMPMETHOD_MASK 1 // Pick a method for rounding. #define ROUND(f) static_cast(f + 0.5f) // #define ROUND(f) lrintf(f) // #define ROUND(f) static_cast(round(f)) // #define ROUND(f) _mm_cvt_ss2si(_mm_load_ss(&f)) #if defined(CLAMPMETHOD_IF) static int RoundToByte(float f) { int i = ROUND(f); if (i < 0) { i = 0; } if (i > 255) { i = 255; } return i; } #elif defined(CLAMPMETHOD_TABLE) static const unsigned char clamptable[811] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255}; static int RoundToByte(float f) { return clamptable[ROUND(f) + 276]; } #elif defined(CLAMPMETHOD_TERNARY) static int RoundToByte(float f) { int i = ROUND(f); return (i < 0) ? 0 : ((i > 255) ? 255 : i); } #elif defined(CLAMPMETHOD_MASK) static int RoundToByte(float f) { int i = ROUND(f); i = ((-(i) >> 31) & (i)); // clamp to 0. return (((255 - (i)) >> 31) | (i)) & 255; // clamp to 255. } #endif #define RANDOM256(s) ((s & 1) ? ((s >> 1) ^ 0xb8) : (s >> 1)) TEST_F(LibYUVColorTest, TestRoundToByte) { int allb = 0; int count = benchmark_width_ * benchmark_height_; for (int i = 0; i < benchmark_iterations_; ++i) { float f = (fastrand() & 255) * 3.14f - 260.f; for (int j = 0; j < count; ++j) { int b = RoundToByte(f); f += 0.91f; allb |= b; } } EXPECT_GE(allb, 0); EXPECT_LE(allb, 255); } // BT.601 limited range YUV to RGB reference static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) { *r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596); *g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813); *b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018); } // BT.601 full range YUV to RGB reference (aka JPEG) static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) { *r = RoundToByte(y - (v - 128) * -1.40200); *g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414); *b = RoundToByte(y - (u - 128) * -1.77200); } // BT.709 limited range YUV to RGB reference // See also http://www.equasys.de/colorconversion.html 
static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  // Integer offsets are removed first; the floating point expressions are
  // evaluated in the same order as before so rounding is unchanged.
  const int dy = y - 16;
  const int du = u - 128;
  const int dv = v - 128;
  const int red = RoundToByte(dy * 1.164 - dv * -1.793);
  const int green = RoundToByte(dy * 1.164 - du * 0.213 - dv * 0.533);
  const int blue = RoundToByte(dy * 1.164 - du * -2.112);
  *r = red;
  *g = green;
  *b = blue;
}

// BT.709 full range YUV to RGB reference
static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  // Full range: luma is used as is, only the chroma samples are centred.
  const int du = u - 128;
  const int dv = v - 128;
  const int red = RoundToByte(y - dv * -1.5748);
  const int green = RoundToByte(y - du * 0.18732 - dv * 0.46812);
  const int blue = RoundToByte(y - du * -1.8556);
  *r = red;
  *g = green;
  *b = blue;
}

// BT.2020 limited range YUV to RGB reference
static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  const int dy = y - 16;
  const int du = u - 128;
  const int dv = v - 128;
  const int red = RoundToByte(dy * 1.164384 - dv * -1.67867);
  const int green = RoundToByte(dy * 1.164384 - du * 0.187326 - dv * 0.65042);
  const int blue = RoundToByte(dy * 1.164384 - du * -2.14177);
  *r = red;
  *g = green;
  *b = blue;
}

// BT.2020 full range YUV to RGB reference
static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
  const int du = u - 128;
  const int dv = v - 128;
  const int red = RoundToByte(y + dv * 1.474600);
  const int green = RoundToByte(y - du * 0.164553 - dv * 0.571353);
  const int blue = RoundToByte(y + du * 1.881400);
  *r = red;
  *g = green;
  *b = blue;
}

// Spot checks two saturated colours exactly, then compares YUVToRGB with the
// floating point reference for every luma value at three chroma settings.
TEST_F(LibYUVColorTest, TestYUV) {
  int r0, g0, b0, r1, g1, b1;

  // cyan (less red)
  YUVToRGBReference(240, 255, 0, &r0, &g0, &b0);
  EXPECT_EQ(56, r0);
  EXPECT_EQ(255, g0);
  EXPECT_EQ(255, b0);

  YUVToRGB(240, 255, 0, &r1, &g1, &b1);
  EXPECT_EQ(57, r1);
  EXPECT_EQ(255, g1);
  EXPECT_EQ(255, b1);

  // green (less red and blue)
  YUVToRGBReference(240, 0, 0, &r0, &g0, &b0);
  EXPECT_EQ(56, r0);
  EXPECT_EQ(255, g0);
  EXPECT_EQ(2, b0);

  YUVToRGB(240, 0, 0, &r1, &g1, &b1);
  EXPECT_EQ(57, r1);
  EXPECT_EQ(255, g1);
#ifdef LIBYUV_UNLIMITED_DATA
  EXPECT_EQ(3, b1);
#else
  EXPECT_EQ(5, b1);
#endif

  // {u, v} pairs visited, in this order, for each luma value.
  static const int kChroma[3][2] = {{128, 128}, {0, 0}, {0, 255}};
  for (int luma = 0; luma < 256; ++luma) {
    for (int c = 0; c < 3; ++c) {
      const int u = kChroma[c][0];
      const int v = kChroma[c][1];
      YUVToRGBReference(luma, u, v, &r0, &g0, &b0);
      YUVToRGB(luma, u, v, &r1, &g1, &b1);
      EXPECT_NEAR(r0, r1, ERROR_R);
      EXPECT_NEAR(g0, g1, ERROR_G);
      EXPECT_NEAR(b0, b1, ERROR_B);
    }
  }
}

// Checks black, white and mid grey exactly, then requires the reference,
// YUVToRGB and YToRGB to agree for every luma value with chroma at 128.
TEST_F(LibYUVColorTest, TestGreyYUV) {
  int r0, g0, b0, r1, g1, b1;

  // black
  YUVToRGBReference(16, 128, 128, &r0, &g0, &b0);
  EXPECT_EQ(0, r0);
  EXPECT_EQ(0, g0);
  EXPECT_EQ(0, b0);

  YUVToRGB(16, 128, 128, &r1, &g1, &b1);
  EXPECT_EQ(0, r1);
  EXPECT_EQ(0, g1);
  EXPECT_EQ(0, b1);

  // white
  YUVToRGBReference(240, 128, 128, &r0, &g0, &b0);
  EXPECT_EQ(255, r0);
  EXPECT_EQ(255, g0);
  EXPECT_EQ(255, b0);

  YUVToRGB(240, 128, 128, &r1, &g1, &b1);
  EXPECT_EQ(255, r1);
  EXPECT_EQ(255, g1);
  EXPECT_EQ(255, b1);

  // grey
  YUVToRGBReference(128, 128, 128, &r0, &g0, &b0);
  EXPECT_EQ(130, r0);
  EXPECT_EQ(130, g0);
  EXPECT_EQ(130, b0);

  YUVToRGB(128, 128, 128, &r1, &g1, &b1);
  EXPECT_EQ(130, r1);
  EXPECT_EQ(130, g1);
  EXPECT_EQ(130, b1);

  for (int y = 0; y < 256; ++y) {
    // The luma-only result is needed inside this loop only.
    int r2, g2, b2;
    YUVToRGBReference(y, 128, 128, &r0, &g0, &b0);
    YUVToRGB(y, 128, 128, &r1, &g1, &b1);
    YToRGB(y, &r2, &g2, &b2);
    EXPECT_EQ(r0, r1);
    EXPECT_EQ(g0, g1);
    EXPECT_EQ(b0, b1);
    EXPECT_EQ(r0, r2);
    EXPECT_EQ(g0, g2);
    EXPECT_EQ(b0, b2);
  }
}

// Prints the three histograms as a tab separated table. A column is emitted
// only for buckets that are non-zero in at least one histogram; the header
// row shows the bucket index minus 128.
static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
  const int* const rows[3] = {rh, gh, bh};
  const char* const labels[3] = {"\nred", "\ngreen", "\nblue"};

  printf("hist");
  for (int bucket = 0; bucket < 256; ++bucket) {
    if (rh[bucket] || gh[bucket] || bh[bucket]) {
      printf("\t%8d", bucket - 128);
    }
  }
  for (int row = 0; row < 3; ++row) {
    printf("%s", labels[row]);
    for (int bucket = 0; bucket < 256; ++bucket) {
      if (rh[bucket] || gh[bucket] || bh[bucket]) {
        printf("\t%8d", rows[row][bucket]);
      }
    }
  }
  printf("\n");
}

// Step by 5 on inner loop goes from 0 to 255 inclusive.
// Set to 1 for better coverage. 3, 5 or 17 for faster testing.
#if defined(DISABLE_SLOW_TESTS)
#define FASTSTEP 5
#else
#define FASTSTEP 1
#endif

// BT.601 limited range.
// Compares YUVToRGB with the floating point reference over every u and v and
// every FASTSTEP-th luma value, and prints a histogram of the per channel
// difference (optimized minus reference, bucket 128 meaning no difference).
TEST_F(LibYUVColorTest, TestFullYUV) {
  int rh[256] = {0};
  int gh[256] = {0};
  int bh[256] = {0};
  for (int u = 0; u < 256; ++u) {
    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        // The loop counter is passed through RANDOM256 to pick the luma.
        int y = RANDOM256(y2);
        YUVToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVToRGB(y, u, v, &r1, &g1, &b1);
        EXPECT_NEAR(r0, r1, ERROR_R);
        EXPECT_NEAR(g0, g1, ERROR_G);
        EXPECT_NEAR(b0, b1, ERROR_B);
        // EXPECT_NEAR does not abort the test, so an out of tolerance result
        // still reaches the histogram update. Clamp the bucket so a large
        // difference is tallied in an end bucket instead of being written
        // outside the 256 entry arrays.
        int rbin = r1 - r0 + 128;
        int gbin = g1 - g0 + 128;
        int bbin = b1 - b0 + 128;
        rbin = (rbin < 0) ? 0 : ((rbin > 255) ? 255 : rbin);
        gbin = (gbin < 0) ? 0 : ((gbin > 255) ? 255 : gbin);
        bbin = (bbin < 0) ? 0 : ((bbin > 255) ? 255 : bbin);
        ++rh[rbin];
        ++gh[gbin];
        ++bh[bbin];
      }
    }
  }
  PrintHistogram(rh, gh, bh);
}

// BT.601 full range.
// Same sweep as TestFullYUV, using YUVJToRGB and its reference.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
  int rh[256] = {0};
  int gh[256] = {0};
  int bh[256] = {0};
  for (int u = 0; u < 256; ++u) {
    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVJToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVJToRGB(y, u, v, &r1, &g1, &b1);
        EXPECT_NEAR(r0, r1, ERROR_R);
        EXPECT_NEAR(g0, g1, ERROR_G);
        EXPECT_NEAR(b0, b1, ERROR_B);
        // Clamp the bucket: a failed EXPECT_NEAR above does not stop the
        // test, and an unclamped index could fall outside the arrays.
        int rbin = r1 - r0 + 128;
        int gbin = g1 - g0 + 128;
        int bbin = b1 - b0 + 128;
        rbin = (rbin < 0) ? 0 : ((rbin > 255) ? 255 : rbin);
        gbin = (gbin < 0) ? 0 : ((gbin > 255) ? 255 : gbin);
        bbin = (bbin < 0) ? 0 : ((bbin > 255) ? 255 : bbin);
        ++rh[rbin];
        ++gh[gbin];
        ++bh[bbin];
      }
    }
  }
  PrintHistogram(rh, gh, bh);
}

// BT.709 limited range.
// Same sweep as TestFullYUV, using YUVHToRGB and its reference.
TEST_F(LibYUVColorTest, TestFullYUVH) {
  int rh[256] = {0};
  int gh[256] = {0};
  int bh[256] = {0};
  for (int u = 0; u < 256; ++u) {
    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVHToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVHToRGB(y, u, v, &r1, &g1, &b1);
        EXPECT_NEAR(r0, r1, ERROR_R);
        EXPECT_NEAR(g0, g1, ERROR_G);
        EXPECT_NEAR(b0, b1, ERROR_B);
        // Clamp the bucket: a failed EXPECT_NEAR above does not stop the
        // test, and an unclamped index could fall outside the arrays.
        int rbin = r1 - r0 + 128;
        int gbin = g1 - g0 + 128;
        int bbin = b1 - b0 + 128;
        rbin = (rbin < 0) ? 0 : ((rbin > 255) ? 255 : rbin);
        gbin = (gbin < 0) ? 0 : ((gbin > 255) ? 255 : gbin);
        bbin = (bbin < 0) ? 0 : ((bbin > 255) ? 255 : bbin);
        ++rh[rbin];
        ++gh[gbin];
        ++bh[bbin];
      }
    }
  }
  PrintHistogram(rh, gh, bh);
}

// BT.709 full range.
// Compares YUVFToRGB with the floating point reference over every u and v and
// every FASTSTEP-th luma value, and prints a histogram of the per channel
// difference (optimized minus reference, bucket 128 meaning no difference).
TEST_F(LibYUVColorTest, TestFullYUVF) {
  int rh[256] = {0};
  int gh[256] = {0};
  int bh[256] = {0};
  for (int u = 0; u < 256; ++u) {
    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        // The loop counter is passed through RANDOM256 to pick the luma.
        int y = RANDOM256(y2);
        YUVFToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVFToRGB(y, u, v, &r1, &g1, &b1);
        EXPECT_NEAR(r0, r1, ERROR_R);
        EXPECT_NEAR(g0, g1, ERROR_G);
        EXPECT_NEAR(b0, b1, ERROR_B);
        // EXPECT_NEAR does not abort the test, so an out of tolerance result
        // still reaches the histogram update. Clamp the bucket so a large
        // difference is tallied in an end bucket instead of being written
        // outside the 256 entry arrays.
        int rbin = r1 - r0 + 128;
        int gbin = g1 - g0 + 128;
        int bbin = b1 - b0 + 128;
        rbin = (rbin < 0) ? 0 : ((rbin > 255) ? 255 : rbin);
        gbin = (gbin < 0) ? 0 : ((gbin > 255) ? 255 : gbin);
        bbin = (bbin < 0) ? 0 : ((bbin > 255) ? 255 : bbin);
        ++rh[rbin];
        ++gh[gbin];
        ++bh[bbin];
      }
    }
  }
  PrintHistogram(rh, gh, bh);
}

// BT.2020 limited range.
// Same sweep as TestFullYUVF, using YUVUToRGB and its reference.
TEST_F(LibYUVColorTest, TestFullYUVU) {
  int rh[256] = {0};
  int gh[256] = {0};
  int bh[256] = {0};
  for (int u = 0; u < 256; ++u) {
    for (int v = 0; v < 256; ++v) {
      for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
        int r0, g0, b0, r1, g1, b1;
        int y = RANDOM256(y2);
        YUVUToRGBReference(y, u, v, &r0, &g0, &b0);
        YUVUToRGB(y, u, v, &r1, &g1, &b1);
        EXPECT_NEAR(r0, r1, ERROR_R);
        EXPECT_NEAR(g0, g1, ERROR_G);
        EXPECT_NEAR(b0, b1, ERROR_B);
        // Clamp the bucket: a failed EXPECT_NEAR above does not stop the
        // test, and an unclamped index could fall outside the arrays.
        int rbin = r1 - r0 + 128;
        int gbin = g1 - g0 + 128;
        int bbin = b1 - b0 + 128;
        rbin = (rbin < 0) ? 0 : ((rbin > 255) ? 255 : rbin);
        gbin = (gbin < 0) ? 0 : ((gbin > 255) ? 255 : gbin);
        bbin = (bbin < 0) ? 0 : ((bbin > 255) ? 255 : bbin);
        ++rh[rbin];
        ++gh[gbin];
        ++bh[bbin];
      }
    }
  }
  PrintHistogram(rh, gh, bh);
}

// BT.2020 full range.
TEST_F(LibYUVColorTest, TestFullYUVV) { int rh[256] = { 0, }; int gh[256] = { 0, }; int bh[256] = { 0, }; for (int u = 0; u < 256; ++u) { for (int v = 0; v < 256; ++v) { for (int y2 = 0; y2 < 256; y2 += FASTSTEP) { int r0, g0, b0, r1, g1, b1; int y = RANDOM256(y2); YUVVToRGBReference(y, u, v, &r0, &g0, &b0); YUVVToRGB(y, u, v, &r1, &g1, &b1); EXPECT_NEAR(r0, r1, ERROR_R); EXPECT_NEAR(g0, g1, 2); EXPECT_NEAR(b0, b1, ERROR_B); ++rh[r1 - r0 + 128]; ++gh[g1 - g0 + 128]; ++bh[b1 - b0 + 128]; } } } PrintHistogram(rh, gh, bh); } #undef FASTSTEP TEST_F(LibYUVColorTest, TestGreyYUVJ) { int r0, g0, b0, r1, g1, b1, r2, g2, b2; // black YUVJToRGBReference(0, 128, 128, &r0, &g0, &b0); EXPECT_EQ(0, r0); EXPECT_EQ(0, g0); EXPECT_EQ(0, b0); YUVJToRGB(0, 128, 128, &r1, &g1, &b1); EXPECT_EQ(0, r1); EXPECT_EQ(0, g1); EXPECT_EQ(0, b1); // white YUVJToRGBReference(255, 128, 128, &r0, &g0, &b0); EXPECT_EQ(255, r0); EXPECT_EQ(255, g0); EXPECT_EQ(255, b0); YUVJToRGB(255, 128, 128, &r1, &g1, &b1); EXPECT_EQ(255, r1); EXPECT_EQ(255, g1); EXPECT_EQ(255, b1); // grey YUVJToRGBReference(128, 128, 128, &r0, &g0, &b0); EXPECT_EQ(128, r0); EXPECT_EQ(128, g0); EXPECT_EQ(128, b0); YUVJToRGB(128, 128, 128, &r1, &g1, &b1); EXPECT_EQ(128, r1); EXPECT_EQ(128, g1); EXPECT_EQ(128, b1); for (int y = 0; y < 256; ++y) { YUVJToRGBReference(y, 128, 128, &r0, &g0, &b0); YUVJToRGB(y, 128, 128, &r1, &g1, &b1); YJToRGB(y, &r2, &g2, &b2); EXPECT_EQ(r0, r1); EXPECT_EQ(g0, g1); EXPECT_EQ(b0, b1); EXPECT_EQ(r0, r2); EXPECT_EQ(g0, g2); EXPECT_EQ(b0, b2); } } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/compare_test.cc000066400000000000000000000544311416500237200225470ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. 
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include #include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/cpu_id.h" #include "libyuv/video_common.h" #ifdef ENABLE_ROW_TESTS #include "libyuv/compare_row.h" /* For HammingDistance_C */ #endif namespace libyuv { // hash seed of 5381 recommended. static uint32_t ReferenceHashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { uint32_t hash = seed; if (count > 0) { do { hash = hash * 33 + *src++; } while (--count); } return hash; } TEST_F(LibYUVCompareTest, Djb2_Test) { const int kMaxTest = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_a, kMaxTest); align_buffer_page_end(src_b, kMaxTest); const char* fox = "The quick brown fox jumps over the lazy dog" " and feels as if he were in the seventh heaven of typography" " together with Hermann Zapf"; uint32_t foxhash = HashDjb2(reinterpret_cast(fox), 131, 5381); const uint32_t kExpectedFoxHash = 2611006483u; EXPECT_EQ(kExpectedFoxHash, foxhash); for (int i = 0; i < kMaxTest; ++i) { src_a[i] = (fastrand() & 0xff); src_b[i] = (fastrand() & 0xff); } // Compare different buffers. Expect hash is different. uint32_t h1 = HashDjb2(src_a, kMaxTest, 5381); uint32_t h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_NE(h1, h2); // Make last half same. Expect hash is different. memcpy(src_a + kMaxTest / 2, src_b + kMaxTest / 2, kMaxTest / 2); h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_NE(h1, h2); // Make first half same. Expect hash is different. memcpy(src_a + kMaxTest / 2, src_a, kMaxTest / 2); memcpy(src_b + kMaxTest / 2, src_b, kMaxTest / 2); memcpy(src_a, src_b, kMaxTest / 2); h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_NE(h1, h2); // Make same. Expect hash is same. 
memcpy(src_a, src_b, kMaxTest); h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_EQ(h1, h2); // Mask seed different. Expect hash is different. memcpy(src_a, src_b, kMaxTest); h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 1234); EXPECT_NE(h1, h2); // Make one byte different in middle. Expect hash is different. memcpy(src_a, src_b, kMaxTest); ++src_b[kMaxTest / 2]; h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_NE(h1, h2); // Make first byte different. Expect hash is different. memcpy(src_a, src_b, kMaxTest); ++src_b[0]; h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_NE(h1, h2); // Make last byte different. Expect hash is different. memcpy(src_a, src_b, kMaxTest); ++src_b[kMaxTest - 1]; h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_b, kMaxTest, 5381); EXPECT_NE(h1, h2); // Make a zeros. Test different lengths. Expect hash is different. memset(src_a, 0, kMaxTest); h1 = HashDjb2(src_a, kMaxTest, 5381); h2 = HashDjb2(src_a, kMaxTest / 2, 5381); EXPECT_NE(h1, h2); // Make a zeros and seed of zero. Test different lengths. Expect hash is same. 
memset(src_a, 0, kMaxTest); h1 = HashDjb2(src_a, kMaxTest, 0); h2 = HashDjb2(src_a, kMaxTest / 2, 0); EXPECT_EQ(h1, h2); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, BenchmarkDjb2_Opt) { const int kMaxTest = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_a, kMaxTest); for (int i = 0; i < kMaxTest; ++i) { src_a[i] = i; } uint32_t h2 = ReferenceHashDjb2(src_a, kMaxTest, 5381); uint32_t h1; for (int i = 0; i < benchmark_iterations_; ++i) { h1 = HashDjb2(src_a, kMaxTest, 5381); } EXPECT_EQ(h1, h2); free_aligned_buffer_page_end(src_a); } TEST_F(LibYUVCompareTest, BenchmarkDjb2_Unaligned) { const int kMaxTest = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_a, kMaxTest + 1); for (int i = 0; i < kMaxTest; ++i) { src_a[i + 1] = i; } uint32_t h2 = ReferenceHashDjb2(src_a + 1, kMaxTest, 5381); uint32_t h1; for (int i = 0; i < benchmark_iterations_; ++i) { h1 = HashDjb2(src_a + 1, kMaxTest, 5381); } EXPECT_EQ(h1, h2); free_aligned_buffer_page_end(src_a); } TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Opt) { uint32_t fourcc; const int kMaxTest = benchmark_width_ * benchmark_height_ * 4; align_buffer_page_end(src_a, kMaxTest); for (int i = 0; i < kMaxTest; ++i) { src_a[i] = 255; } src_a[0] = 0; fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_, benchmark_height_); EXPECT_EQ(static_cast(libyuv::FOURCC_BGRA), fourcc); src_a[0] = 255; src_a[3] = 0; fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_, benchmark_height_); EXPECT_EQ(static_cast(libyuv::FOURCC_ARGB), fourcc); src_a[3] = 255; for (int i = 0; i < benchmark_iterations_; ++i) { fourcc = ARGBDetect(src_a, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } EXPECT_EQ(0u, fourcc); free_aligned_buffer_page_end(src_a); } TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) { uint32_t fourcc; const int kMaxTest = benchmark_width_ * benchmark_height_ * 4 + 1; align_buffer_page_end(src_a, 
kMaxTest); for (int i = 1; i < kMaxTest; ++i) { src_a[i] = 255; } src_a[0 + 1] = 0; fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_, benchmark_height_); EXPECT_EQ(static_cast(libyuv::FOURCC_BGRA), fourcc); src_a[0 + 1] = 255; src_a[3 + 1] = 0; fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_, benchmark_height_); EXPECT_EQ(static_cast(libyuv::FOURCC_ARGB), fourcc); src_a[3 + 1] = 255; for (int i = 0; i < benchmark_iterations_; ++i) { fourcc = ARGBDetect(src_a + 1, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } EXPECT_EQ(0u, fourcc); free_aligned_buffer_page_end(src_a); } #ifdef ENABLE_ROW_TESTS TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); // Test known value memcpy(src_a, "test0123test4567", 16); memcpy(src_b, "tick0123tock4567", 16); uint32_t h1 = HammingDistance_C(src_a, src_b, 16); EXPECT_EQ(16u, h1); // Test C vs OPT on random buffer MemRandomize(src_a, kMaxWidth); MemRandomize(src_b, kMaxWidth); uint32_t h0 = HammingDistance_C(src_a, src_b, kMaxWidth); int count = benchmark_iterations_ * ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); for (int i = 0; i < count; ++i) { #if defined(HAS_HAMMINGDISTANCE_NEON) h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth); #elif defined(HAS_HAMMINGDISTANCE_AVX2) int has_avx2 = TestCpuFlag(kCpuHasAVX2); if (has_avx2) { h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); } else { int has_sse42 = TestCpuFlag(kCpuHasSSE42); if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); if (has_ssse3) { h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } } } #elif defined(HAS_HAMMINGDISTANCE_SSE42) int has_sse42 = TestCpuFlag(kCpuHasSSE42); if (has_sse42) 
{ h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } #else h1 = HammingDistance_C(src_a, src_b, kMaxWidth); #endif } EXPECT_EQ(h0, h1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_C) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); // Test known value memcpy(src_a, "test0123test4567", 16); memcpy(src_b, "tick0123tock4567", 16); uint32_t h1 = HammingDistance_C(src_a, src_b, 16); EXPECT_EQ(16u, h1); // Test C vs OPT on random buffer MemRandomize(src_a, kMaxWidth); MemRandomize(src_b, kMaxWidth); uint32_t h0 = HammingDistance_C(src_a, src_b, kMaxWidth); int count = benchmark_iterations_ * ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); for (int i = 0; i < count; ++i) { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } EXPECT_EQ(h0, h1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, BenchmarkHammingDistance) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); memcpy(src_a, "test0123test4567", 16); memcpy(src_b, "tick0123tock4567", 16); uint64_t h1 = ComputeHammingDistance(src_a, src_b, 16); EXPECT_EQ(16u, h1); // Test C vs OPT on random buffer MemRandomize(src_a, kMaxWidth); MemRandomize(src_b, kMaxWidth); uint32_t h0 = HammingDistance_C(src_a, src_b, kMaxWidth); int count = benchmark_iterations_ * ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); for (int i = 0; i < count; ++i) { h1 = ComputeHammingDistance(src_a, src_b, kMaxWidth); } EXPECT_EQ(h0, h1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } // Tests low levels match reference C for specified size. 
// The opt implementations have size limitations // For NEON the counters are 16 bit so the shorts overflow after 65536 bytes. // So doing one less iteration of the loop is the maximum. #if defined(HAS_HAMMINGDISTANCE_NEON) static const int kMaxOptCount = 65536 - 32; // 65504 #else static const int kMaxOptCount = (1 << (32 - 3)) - 64; // 536870848 #endif TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { uint32_t h1 = 0; const int kMaxWidth = (benchmark_width_ * benchmark_height_ + 63) & ~63; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 255u, kMaxWidth); memset(src_b, 0u, kMaxWidth); uint64_t h0 = ComputeHammingDistance(src_a, src_b, kMaxWidth); EXPECT_EQ(kMaxWidth * 8ULL, h0); for (int i = 0; i < benchmark_iterations_; ++i) { #if defined(HAS_HAMMINGDISTANCE_NEON) h1 = HammingDistance_NEON(src_a, src_b, kMaxWidth); #elif defined(HAS_HAMMINGDISTANCE_AVX2) int has_avx2 = TestCpuFlag(kCpuHasAVX2); if (has_avx2) { h1 = HammingDistance_AVX2(src_a, src_b, kMaxWidth); } else { int has_sse42 = TestCpuFlag(kCpuHasSSE42); if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); if (has_ssse3) { h1 = HammingDistance_SSSE3(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } } } #elif defined(HAS_HAMMINGDISTANCE_SSE42) int has_sse42 = TestCpuFlag(kCpuHasSSE42); if (has_sse42) { h1 = HammingDistance_SSE42(src_a, src_b, kMaxWidth); } else { h1 = HammingDistance_C(src_a, src_b, kMaxWidth); } #else h1 = HammingDistance_C(src_a, src_b, kMaxWidth); #endif } // A large count will cause the low level to potentially overflow so the // result can not be expected to be correct. // TODO(fbarchard): Consider expecting the low 16 bits to match. 
if (kMaxWidth <= kMaxOptCount) { EXPECT_EQ(kMaxWidth * 8U, h1); } else { if (kMaxWidth * 8ULL != static_cast(h1)) { printf( "warning - HammingDistance_Opt %u does not match %llu " "but length of %u is longer than guaranteed.\n", h1, kMaxWidth * 8ULL, kMaxWidth); } else { printf( "warning - HammingDistance_Opt %u matches but length of %u " "is longer than guaranteed.\n", h1, kMaxWidth); } } free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } #endif // ENABLE_ROW_TESTS TEST_F(LibYUVCompareTest, TestHammingDistance) { align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_); align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_); memset(src_a, 255u, benchmark_width_ * benchmark_height_); memset(src_b, 0, benchmark_width_ * benchmark_height_); uint64_t h1 = 0; for (int i = 0; i < benchmark_iterations_; ++i) { h1 = ComputeHammingDistance(src_a, src_b, benchmark_width_ * benchmark_height_); } EXPECT_EQ(benchmark_width_ * benchmark_height_ * 8ULL, h1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, BenchmarkSumSquareError_Opt) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); memcpy(src_a, "test0123test4567", 16); memcpy(src_b, "tick0123tock4567", 16); uint64_t h1 = ComputeSumSquareError(src_a, src_b, 16); EXPECT_EQ(790u, h1); for (int i = 0; i < kMaxWidth; ++i) { src_a[i] = i; src_b[i] = i; } memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); int count = benchmark_iterations_ * ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); for (int i = 0; i < count; ++i) { h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth); } EXPECT_EQ(0u, h1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, SumSquareError) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); 
align_buffer_page_end(src_b, kMaxWidth); memset(src_a, 0, kMaxWidth); memset(src_b, 0, kMaxWidth); uint64_t err; err = ComputeSumSquareError(src_a, src_b, kMaxWidth); EXPECT_EQ(0u, err); memset(src_a, 1, kMaxWidth); err = ComputeSumSquareError(src_a, src_b, kMaxWidth); EXPECT_EQ(static_cast(err), kMaxWidth); memset(src_a, 190, kMaxWidth); memset(src_b, 193, kMaxWidth); err = ComputeSumSquareError(src_a, src_b, kMaxWidth); EXPECT_EQ(static_cast(err), kMaxWidth * 3 * 3); for (int i = 0; i < kMaxWidth; ++i) { src_a[i] = (fastrand() & 0xff); src_b[i] = (fastrand() & 0xff); } MaskCpuFlags(disable_cpu_flags_); uint64_t c_err = ComputeSumSquareError(src_a, src_b, kMaxWidth); MaskCpuFlags(benchmark_cpu_info_); uint64_t opt_err = ComputeSumSquareError(src_a, src_b, kMaxWidth); EXPECT_EQ(c_err, opt_err); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, BenchmarkPsnr_Opt) { align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_); align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_); for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { src_a[i] = i; src_b[i] = i; } MaskCpuFlags(benchmark_cpu_info_); double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) { CalcFramePsnr(src_a, benchmark_width_, src_b, benchmark_width_, benchmark_width_, benchmark_height_); } opt_time = (get_time() - opt_time) / benchmark_iterations_; printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6); EXPECT_EQ(0, 0); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, BenchmarkPsnr_Unaligned) { align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_ + 1); align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_); for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { src_a[i + 1] = i; src_b[i] = i; } MaskCpuFlags(benchmark_cpu_info_); double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) 
{ CalcFramePsnr(src_a + 1, benchmark_width_, src_b, benchmark_width_, benchmark_width_, benchmark_height_); } opt_time = (get_time() - opt_time) / benchmark_iterations_; printf("BenchmarkPsnr_Opt - %8.2f us opt\n", opt_time * 1e6); EXPECT_EQ(0, 0); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, Psnr) { const int kSrcWidth = benchmark_width_; const int kSrcHeight = benchmark_height_; const int b = 128; const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2); const int kSrcStride = 2 * b + kSrcWidth; align_buffer_page_end(src_a, kSrcPlaneSize); align_buffer_page_end(src_b, kSrcPlaneSize); memset(src_a, 0, kSrcPlaneSize); memset(src_b, 0, kSrcPlaneSize); double err; err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); EXPECT_EQ(err, kMaxPsnr); memset(src_a, 255, kSrcPlaneSize); err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); EXPECT_EQ(err, 0.0); memset(src_a, 1, kSrcPlaneSize); err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); EXPECT_GT(err, 48.0); EXPECT_LT(err, 49.0); for (int i = 0; i < kSrcPlaneSize; ++i) { src_a[i] = i; } err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); EXPECT_GT(err, 2.0); if (kSrcWidth * kSrcHeight >= 256) { EXPECT_LT(err, 6.0); } memset(src_a, 0, kSrcPlaneSize); memset(src_b, 0, kSrcPlaneSize); for (int i = b; i < (kSrcHeight + b); ++i) { for (int j = b; j < (kSrcWidth + b); ++j) { src_a[(i * kSrcStride) + j] = (fastrand() & 0xff); src_b[(i * kSrcStride) + j] = (fastrand() & 0xff); } } MaskCpuFlags(disable_cpu_flags_); double c_err, opt_err; c_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); 
MaskCpuFlags(benchmark_cpu_info_); opt_err = CalcFramePsnr(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); EXPECT_EQ(opt_err, c_err); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, DISABLED_BenchmarkSsim_Opt) { align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_); align_buffer_page_end(src_b, benchmark_width_ * benchmark_height_); for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { src_a[i] = i; src_b[i] = i; } MaskCpuFlags(benchmark_cpu_info_); double opt_time = get_time(); for (int i = 0; i < benchmark_iterations_; ++i) { CalcFrameSsim(src_a, benchmark_width_, src_b, benchmark_width_, benchmark_width_, benchmark_height_); } opt_time = (get_time() - opt_time) / benchmark_iterations_; printf("BenchmarkSsim_Opt - %8.2f us opt\n", opt_time * 1e6); EXPECT_EQ(0, 0); // Pass if we get this far. free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } TEST_F(LibYUVCompareTest, Ssim) { const int kSrcWidth = benchmark_width_; const int kSrcHeight = benchmark_height_; const int b = 128; const int kSrcPlaneSize = (kSrcWidth + b * 2) * (kSrcHeight + b * 2); const int kSrcStride = 2 * b + kSrcWidth; align_buffer_page_end(src_a, kSrcPlaneSize); align_buffer_page_end(src_b, kSrcPlaneSize); memset(src_a, 0, kSrcPlaneSize); memset(src_b, 0, kSrcPlaneSize); if (kSrcWidth <= 8 || kSrcHeight <= 8) { printf("warning - Ssim size too small. 
Testing function executes.\n"); } double err; err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_EQ(err, 1.0); } memset(src_a, 255, kSrcPlaneSize); err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_LT(err, 0.0001); } memset(src_a, 1, kSrcPlaneSize); err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_GT(err, 0.0001); EXPECT_LT(err, 0.9); } for (int i = 0; i < kSrcPlaneSize; ++i) { src_a[i] = i; } err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_GT(err, 0.0); EXPECT_LT(err, 0.01); } for (int i = b; i < (kSrcHeight + b); ++i) { for (int j = b; j < (kSrcWidth + b); ++j) { src_a[(i * kSrcStride) + j] = (fastrand() & 0xff); src_b[(i * kSrcStride) + j] = (fastrand() & 0xff); } } MaskCpuFlags(disable_cpu_flags_); double c_err, opt_err; c_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); MaskCpuFlags(benchmark_cpu_info_); opt_err = CalcFrameSsim(src_a + kSrcStride * b + b, kSrcStride, src_b + kSrcStride * b + b, kSrcStride, kSrcWidth, kSrcHeight); if (kSrcWidth > 8 && kSrcHeight > 8) { EXPECT_EQ(opt_err, c_err); } free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/convert_test.cc000066400000000000000000007140201416500237200225760ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/convert.h" #include "libyuv/convert_argb.h" #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif #include "../unit_test/unit_test.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/video_common.h" #ifdef ENABLE_ROW_TESTS #include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */ #endif // Some functions fail on big endian. Enable these tests on all cpus except // PowerPC, but they are not optimized so disabled by default. #if !defined(DISABLE_SLOW_TESTS) && !defined(__powerpc__) #define LITTLE_ENDIAN_ONLY_TEST 1 #endif #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) // SLOW TESTS are those that are unoptimized C code. // FULL TESTS are optimized but test many variations of the same code. 
#define ENABLE_FULL_TESTS #endif namespace libyuv { // Alias to copy pixels as is #define AR30ToAR30 ARGBCopy #define ABGRToABGR ARGBCopy #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) // Planar test #define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ SRC_DEPTH) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ align_buffer_page_end(src_u, \ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ align_buffer_page_end(src_v, \ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ MemRandomize(src_u + OFF, 
kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ SRC_T* src_u_p = reinterpret_cast(src_u + OFF); \ SRC_T* src_v_p = reinterpret_cast(src_v + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ reinterpret_cast(dst_y_c), kWidth, \ reinterpret_cast(dst_u_c), kDstHalfWidth, \ reinterpret_cast(dst_v_c), kDstHalfWidth, kWidth, \ NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ reinterpret_cast(dst_y_opt), kWidth, \ reinterpret_cast(dst_u_opt), kDstHalfWidth, \ reinterpret_cast(dst_v_opt), kDstHalfWidth, kWidth, \ NEG kHeight); \ } \ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ } \ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_u_c); \ free_aligned_buffer_page_end(dst_v_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_u_opt); \ 
free_aligned_buffer_page_end(dst_v_opt); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ } #define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 2, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \ benchmark_width_, _Opt, +, 0, SRC_DEPTH) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8) TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8) TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8) TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8) TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I012, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10) 
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12) TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12) TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I012, uint16_t, 2, 2, 2, 12) TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I012, uint16_t, 2, 2, 2, 12) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 10) TESTPLANARTOP(I410, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 12) TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I422, uint8_t, 1, 2, 1, 12) TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) // Test Android 420 to I420 #define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF, PN, OFF_U, OFF_V) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##To##PN##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ align_buffer_page_end(src_uv, \ kSizeUV*((PIXEL_STRIDE == 3) ? 
3 : 2) + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ uint8_t* src_u = src_uv + OFF_U; \ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ (fastrand() & 0xff); \ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ (fastrand() & 0xff); \ } \ } \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_u_c, 2, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_c, 3, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_u_opt, 102, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_opt, 103, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, 
kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ } \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_u_c); \ free_aligned_buffer_page_end(dst_v_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_u_opt); \ free_aligned_buffer_page_end(dst_v_opt); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ } #define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \ _Any, +, 0, PN, OFF_U, OFF_V) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ _Unaligned, +, 2, PN, OFF_U, OFF_V) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \ -, 0, PN, OFF_U, OFF_V) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ 0, 
PN, OFF_U, OFF_V) TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2) TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2) TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) #undef TESTAPLANARTOP #undef TESTAPLANARTOPI // wrapper to keep API the same int I400ToNV21(const uint8_t* src_y, int src_stride_y, const uint8_t* /* src_u */, int /* src_stride_u */, const uint8_t* /* src_v */, int /* src_stride_v */, uint8_t* dst_y, int dst_stride_y, uint8_t* dst_vu, int dst_stride_vu, int width, int height) { return I400ToNV21(src_y, src_stride_y, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } #define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ SRC_DEPTH) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ align_buffer_page_end(src_u, \ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ align_buffer_page_end(src_v, \ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ 
align_buffer_page_end(dst_uv_c, \ kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_uv_opt, \ kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ SRC_T* src_u_p = reinterpret_cast(src_u + OFF); \ SRC_T* src_v_p = reinterpret_cast(src_v + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \ src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \ src_v_p, kSrcHalfWidth, \ reinterpret_cast(dst_y_c), kWidth, \ reinterpret_cast(dst_uv_c), \ kDstHalfWidth * 2, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ reinterpret_cast(dst_y_opt), kWidth, \ reinterpret_cast(dst_uv_opt), kDstHalfWidth * 2, kWidth, \ NEG kHeight); \ } \ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ } \ for (int i = 0; i < kDstHalfWidth * kDstHalfHeight * DST_BPC * 2; ++i) { \ EXPECT_EQ(dst_uv_c[i], dst_uv_opt[i]); \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ 
free_aligned_buffer_page_end(dst_uv_opt); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ } #define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) TESTPLANARTOBP(I420, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) TESTPLANARTOBP(I422, uint8_t, 1, 2, 1, NV21, uint8_t, 1, 2, 2, 8) TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV12, uint8_t, 1, 2, 2, 8) TESTPLANARTOBP(I444, uint8_t, 1, 1, 1, NV21, uint8_t, 1, 2, 2, 8) TESTPLANARTOBP(I400, uint8_t, 1, 2, 2, NV21, uint8_t, 1, 2, 2, 8) TESTPLANARTOBP(I010, uint16_t, 2, 2, 2, P010, uint16_t, 2, 2, 2, 10) TESTPLANARTOBP(I210, uint16_t, 2, 2, 1, P210, uint16_t, 2, 2, 1, 10) TESTPLANARTOBP(I012, uint16_t, 2, 2, 2, P012, uint16_t, 2, 2, 2, 12) TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) #define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ DOY, SRC_DEPTH) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ 
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ align_buffer_page_end(src_uv, \ 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_uv_c, \ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_uv_opt, \ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y_p[i] = \ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \ src_uv_p[i] = \ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_uv_p, 2 * 
kSrcHalfWidth, \ DOY ? reinterpret_cast(dst_y_c) : NULL, kWidth, \ reinterpret_cast(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \ DOY ? reinterpret_cast(dst_y_opt) : NULL, kWidth, \ reinterpret_cast(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ NEG kHeight); \ } \ if (DOY) { \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ } \ for (int i = 0; i < kDstHalfHeight; ++i) { \ for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \ EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \ dst_uv_opt[i * 2 * kDstHalfWidth + j]); \ } \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_uv_opt); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ } #define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, 1, \ SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, 1, \ SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \ SRC_DEPTH) \ TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \ 
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \ SRC_DEPTH) TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8) TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8) TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8) TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8) TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 10) TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 10) TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 10) TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 12) TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 12) TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 12) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \ SRC_DEPTH) \ TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ "DST_SUBSAMP_Y unsupported"); \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ align_buffer_page_end(src_y, 
kWidth* kHeight* SRC_BPC + OFF); \ align_buffer_page_end(src_uv, \ kSrcHalfWidth* kSrcHalfHeight* SRC_BPC * 2 + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y_p[i] = \ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight * 2; ++i) { \ src_uv_p[i] = \ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \ reinterpret_cast(dst_y_c), kWidth, \ reinterpret_cast(dst_u_c), kDstHalfWidth, \ reinterpret_cast(dst_v_c), kDstHalfWidth, kWidth, \ NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \ reinterpret_cast(dst_y_opt), kWidth, \ reinterpret_cast(dst_u_opt), kDstHalfWidth, \ reinterpret_cast(dst_v_opt), kDstHalfWidth, kWidth, \ NEG kHeight); \ } \ for (int i = 0; i < kHeight * kWidth * DST_BPC; ++i) { \ EXPECT_EQ(dst_y_c[i], dst_y_opt[i]); \ } \ for (int i = 0; i < 
kDstHalfWidth * kDstHalfHeight * DST_BPC; ++i) { \ EXPECT_EQ(dst_u_c[i], dst_u_opt[i]); \ EXPECT_EQ(dst_v_c[i], dst_v_opt[i]); \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_u_c); \ free_aligned_buffer_page_end(dst_v_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_u_opt); \ free_aligned_buffer_page_end(dst_v_opt); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ } #define TESTBIPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \ DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_ + 1, _Any, +, 0, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 2, \ SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, SRC_DEPTH) \ TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, SRC_DEPTH) TESTBIPLANARTOP(NV12, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) TESTBIPLANARTOP(NV21, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) // Provide matrix wrappers for full range bt.709 #define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j) #define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j) #define F422ToABGR(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j) #define F422ToARGB(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrix(a, b, c, d, e, f, 
g, h, &kYuvF709Constants, i, j) #define F444ToABGR(a, b, c, d, e, f, g, h, i, j) \ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j) #define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j) // Provide matrix wrappers for full range bt.2020 #define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \ I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j) #define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \ I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j) #define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j) #define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \ I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j) #define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \ I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j) #define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \ I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j) #define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ align_buffer_page_end(src_u, kSizeUV + OFF); \ align_buffer_page_end(src_v, kSizeUV + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y[i + OFF] = (fastrand() & 0xff); \ } \ for (int i = 0; i < kSizeUV; ++i) { \ src_u[i + OFF] = (fastrand() & 0xff); \ src_v[i + OFF] = (fastrand() 
& 0xff); \ } \ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ MaskCpuFlags(disable_cpu_flags_); \ double time0 = get_time(); \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \ kWidth, NEG kHeight); \ double time1 = get_time(); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \ src_v + OFF, kStrideUV, dst_argb_opt + OFF, \ kStrideB, kWidth, NEG kHeight); \ } \ double time2 = get_time(); \ printf(" %8d us C - %8d us OPT\n", \ static_cast((time1 - time0) * 1e6), \ static_cast((time2 - time1) * 1e6 / benchmark_iterations_)); \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ } \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #if defined(ENABLE_FULL_TESTS) #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_ + 1, _Any, +, 0) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 4) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Invert, -, 0) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0) #else #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_ + 1, _Any, +, 0) \ TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0) 
#endif  // ENABLE_FULL_TESTS

// Planar YUV -> packed format test instantiations.
// Arguments: FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN.
#if defined(ENABLE_FULL_TESTS)
TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(J420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(J420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(F420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(F420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1)
#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1)
TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
#endif
TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(J444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(J444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1)
#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, AB30, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, AB30, 4, 4, 1)
#endif
#else
// Reduced set used when ENABLE_FULL_TESTS is not defined.
TESTPLANARTOB(I420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1)
TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1)
TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1)
#endif
TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
#endif

// Defines one gtest case named FMT_PLANAR##To##FMT_B##N for a planar source
// with a fourth (alpha) plane. src_a is sized like src_y and passed with
// stride kWidth. ATTEN is forwarded as the final argument of the conversion.
// The remaining parameters are used as follows: SUBSAMP_X/SUBSAMP_Y size the
// chroma planes, BPP_B/ALIGN give the destination stride, YALIGN rounds the
// height, W1280 is the width, NEG is the sign token in front of the height,
// and OFF is a byte offset added to every buffer pointer.
// The conversion runs once with disable_cpu_flags_ masked and
// benchmark_iterations_ times with benchmark_cpu_info_; the outputs must match.
#define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                        YALIGN, W1280, N, NEG, OFF, ATTEN) \
  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
    const int kWidth = W1280; \
    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
    align_buffer_page_end(src_u, kSizeUV + OFF); \
    align_buffer_page_end(src_v, kSizeUV + OFF); \
    align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      src_y[i + OFF] = (fastrand() & 0xff); \
      src_a[i + OFF] = (fastrand() & 0xff); \
    } \
    for (int i = 0; i < kSizeUV; ++i) { \
      src_u[i + OFF] = (fastrand() & 0xff); \
      src_v[i + OFF] = (fastrand() & 0xff); \
    } \
    /* Different fill values so bytes left unwritten show up as mismatches. */ \
    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
    MaskCpuFlags(disable_cpu_flags_); \
    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
                          src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
                          dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \
                          ATTEN); \
    MaskCpuFlags(benchmark_cpu_info_); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
                            src_v + OFF, kStrideUV, src_a + OFF, kWidth, \
                            dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, \
                            ATTEN); \
    } \
    for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \
      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \
    } \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_u); \
    free_aligned_buffer_page_end(src_v); \
    free_aligned_buffer_page_end(src_a); \
    free_aligned_buffer_page_end(dst_argb_c); \
    free_aligned_buffer_page_end(dst_argb_opt); \
  }

// TESTQPLANARTOB expands to a set of TESTQPLANARTOBI cases. With
// ENABLE_FULL_TESTS: _Any, _Unaligned (OFF = 2), _Invert, _Opt and _Premult
// (ATTEN = 1). Otherwise only _Opt.
#if defined(ENABLE_FULL_TESTS)
#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                       YALIGN) \
  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \
  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Unaligned, +, 2, 0) \
  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Invert, -, 0, 0) \
  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Opt, +, 0, 0) \
  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Premult, +, 0, 1)
#else
#define TESTQPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                       YALIGN) \
  TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Opt, +, 0, 0)
#endif

// Alpha-plane matrix wrappers: each forwards its 13 arguments to the matching
// I4xxAlphaTo{ARGB,ABGR}Matrix function and inserts a pointer to the named
// constants after the tenth argument.
#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
                        l, m)
#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
                        l, m)
#define F420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
                        l, m)
#define F420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
                        l, m)
#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
                        l, m)
#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
                        l, m)
#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                        l, m)
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                        l, m)
#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                        l, m)
#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                        l, m)
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
                        l, m)
// Alpha-plane matrix wrappers for 4:2:2 and 4:4:4 sources. Each forwards its
// 13 arguments to the matching I4xxAlphaTo{ARGB,ABGR}Matrix function and
// inserts a pointer to the named constants after the tenth argument.
#define J422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
                        l, m)
#define F422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
                        l, m)
#define F422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
                        l, m)
#define H422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
                        l, m)
#define H422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
                        l, m)
#define U422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                        l, m)
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                        l, m)
#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                        l, m)
#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                        l, m)
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
                        l, m)
#define J444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
                        l, m)
#define F444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
                        l, m)
#define F444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \
                        l, m)
#define H444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
                        l, m)
#define H444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \
                        l, m)
#define U444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                        l, m)
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
                        l, m)
#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                        l, m)
#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
  I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
                        l, m)

// Planar YUV + alpha -> packed format test instantiations.
// Arguments: FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, YALIGN.
#if defined(ENABLE_FULL_TESTS)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1) #else TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1) #endif #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ BPP_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideB = kWidth * BPP_B; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ align_buffer_page_end(src_uv, \ kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < kStrideUV * 2; ++j) { \ src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \ } \ } \ memset(dst_argb_c, 1, kStrideB* kHeight); \ memset(dst_argb_opt, 101, kStrideB* kHeight); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, 
kStrideUV * 2, \ dst_argb_opt, kWidth * BPP_B, kWidth, \ NEG kHeight); \ } \ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \ align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \ memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \ memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \ FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \ kHeight); \ FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ kHeight); \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth * 4; ++j) { \ EXPECT_EQ(dst_argb32_c[i * kWidth * 4 + j], \ dst_argb32_opt[i * kWidth * 4 + j]); \ } \ } \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ free_aligned_buffer_page_end(dst_argb32_c); \ free_aligned_buffer_page_end(dst_argb32_opt); \ } #define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_ + 1, _Any, +, 0) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, _Unaligned, +, 2) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, _Invert, -, 0) \ TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, _Opt, +, 0) #define JNV12ToARGB(a, b, c, d, e, f, g, h) \ NV12ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) #define JNV21ToARGB(a, b, c, d, e, f, g, h) \ NV21ToARGBMatrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) #define JNV12ToABGR(a, b, c, d, e, f, g, h) \ NV21ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) #define JNV21ToABGR(a, b, c, d, e, f, g, h) \ NV12ToARGBMatrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) #define JNV12ToRGB24(a, b, c, d, e, f, g, h) \ NV12ToRGB24Matrix(a, 
b, c, d, e, f, &kYuvJPEGConstants, g, h) #define JNV21ToRGB24(a, b, c, d, e, f, g, h) \ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) #define JNV12ToRAW(a, b, c, d, e, f, g, h) \ NV21ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) #define JNV21ToRAW(a, b, c, d, e, f, g, h) \ NV12ToRGB24Matrix(a, b, c, d, e, f, &kYvuJPEGConstants, g, h) #define JNV12ToRGB565(a, b, c, d, e, f, g, h) \ NV12ToRGB565Matrix(a, b, c, d, e, f, &kYuvJPEGConstants, g, h) TESTBIPLANARTOB(JNV12, 2, 2, ARGB, ARGB, 4) TESTBIPLANARTOB(JNV21, 2, 2, ARGB, ARGB, 4) TESTBIPLANARTOB(JNV12, 2, 2, ABGR, ABGR, 4) TESTBIPLANARTOB(JNV21, 2, 2, ABGR, ABGR, 4) TESTBIPLANARTOB(JNV12, 2, 2, RGB24, RGB24, 3) TESTBIPLANARTOB(JNV21, 2, 2, RGB24, RGB24, 3) TESTBIPLANARTOB(JNV12, 2, 2, RAW, RAW, 3) TESTBIPLANARTOB(JNV21, 2, 2, RAW, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTBIPLANARTOB(JNV12, 2, 2, RGB565, RGB565, 2) #endif TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4) TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4) TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4) TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4) TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3) TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3) TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3) TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3) TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2) #endif #define TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ align_buffer_page_end(dst_uv_c, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ 
align_buffer_page_end(dst_uv_opt, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ kStrideUV * 2, dst_uv_c + kStrideUV, kStrideUV * 2, \ kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ dst_uv_opt, kStrideUV * 2, dst_uv_opt + kStrideUV, \ kStrideUV * 2, kWidth, NEG kHeight); \ } \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; ++i) { \ for (int j = 0; j < kStrideUV; ++j) { \ EXPECT_EQ(dst_uv_c[i * kStrideUV + j], dst_uv_opt[i * kStrideUV + j]); \ } \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_uv_opt); \ free_aligned_buffer_page_end(src_argb); \ } #if defined(ENABLE_FULL_TESTS) #define TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_ + 1, _Any, +, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 2) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Invert, -, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Opt, +, 0) #else #define 
TESTATOPLANAR(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_ + 1, _Any, +, 0) \ TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Opt, +, 0) #endif TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2) TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2) TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1) TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2) TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2) #endif TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2) TESTATOPLANAR(I400, 1, 1, I420, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2) TESTATOPLANAR(RAW, 3, 1, J420, 2, 2) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2) TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2) TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2) TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2) TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1) TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2) TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) #define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_PLANAR##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ align_buffer_page_end(dst_uv_c, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ align_buffer_page_end(dst_uv_opt, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_uv_c, 2, 
kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ kStrideUV * 2, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_opt, kWidth, \ dst_uv_opt, kStrideUV * 2, kWidth, NEG kHeight); \ } \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < kStrideUV * 2; ++j) { \ EXPECT_EQ(dst_uv_c[i * kStrideUV * 2 + j], \ dst_uv_opt[i * kStrideUV * 2 + j]); \ } \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_uv_opt); \ free_aligned_buffer_page_end(src_argb); \ } #define TESTATOBIPLANAR(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_ + 1, _Any, +, 0) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Unaligned, +, 2) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Invert, -, 0) \ TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, _Opt, +, 0) TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ 
EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ const int kStrideA = \ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb, \ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ align_buffer_page_end(dst_argb_opt, \ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 1, kStrideB* kHeightB); \ memset(dst_argb_opt, 101, kStrideB* kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \ kStrideB, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, \ (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ } \ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \ TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ for (int times = 0; times < benchmark_iterations_; ++times) { \ const int kWidth = (fastrand() & 63) + 1; \ const int kHeight = (fastrand() & 31) + 1; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ 
const int kStrideA = \ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ align_buffer_page_end(dst_argb_c, \ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ align_buffer_page_end(dst_argb_opt, \ kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i] = 0xfe; \ } \ memset(dst_argb_c, 123, kStrideB* kHeightB); \ memset(dst_argb_opt, 123, kStrideB* kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \ kStrideB, kWidth, kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \ kStrideB, kWidth, kHeight); \ for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } \ } #if defined(ENABLE_FULL_TESTS) #define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ EPP_B, STRIDE_B, HEIGHT_B) \ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ STRIDE_B, HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ STRIDE_B, HEIGHT_B, benchmark_width_, _Unaligned, +, 4) \ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ STRIDE_B, HEIGHT_B, benchmark_width_, _Invert, -, 0) \ TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) \ TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ EPP_B, STRIDE_B, HEIGHT_B) #else #define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ EPP_B, STRIDE_B, HEIGHT_B) \ 
TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) #endif TESTATOB(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) TESTATOB(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif TESTATOB(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) #endif TESTATOB(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) TESTATOB(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #endif TESTATOB(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) TESTATOB(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) TESTATOB(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 
4, 1) TESTATOB(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) TESTATOB(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) TESTATOB(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) TESTATOB(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) TESTATOB(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) TESTATOB(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) TESTATOB(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOB(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) #endif TESTATOB(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) TESTATOB(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) TESTATOB(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) TESTATOB(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) TESTATOB(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) TESTATOB(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ const int 
kStrideA = \ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 1, kStrideB* kHeightB); \ memset(dst_argb_opt, 101, kStrideB* kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \ NULL, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_opt, \ kStrideB, NULL, kWidth, NEG kHeight); \ } \ for (int i = 0; i < kStrideB * kHeightB; ++i) { \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ STRIDE_B, HEIGHT_B) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##Dither_Random) { \ for (int times = 0; times < benchmark_iterations_; ++times) { \ const int kWidth = (fastrand() & 63) + 1; \ const int kHeight = (fastrand() & 31) + 1; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ const int kStrideA = \ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb, kStrideA* kHeightA); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ src_argb[i] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 123, 
kStrideB* kHeightB); \ memset(dst_argb_opt, 123, kStrideB* kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \ kWidth, kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_opt, kStrideB, \ NULL, kWidth, kHeight); \ for (int i = 0; i < kStrideB * kHeightB; ++i) { \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } \ } #define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, benchmark_width_ + 1, _Any, +, 0) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, benchmark_width_, _Unaligned, +, 2) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, benchmark_width_, _Invert, -, 0) \ TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, benchmark_width_, _Opt, +, 0) \ TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif // These conversions called twice, produce the original result. // e.g. endian swap twice. 
#define TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \ OFF) \ TEST_F(LibYUVConvertTest, FMT_ATOB##_Endswap##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kStrideA = \ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ align_buffer_page_end(src_argb, \ kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ align_buffer_page_end(dst_argb_opt, \ kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 1, kStrideA* kHeightA); \ memset(dst_argb_opt, 101, kStrideA* kHeightA); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \ kStrideA, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_opt, \ kStrideA, kWidth, NEG kHeight); \ } \ MaskCpuFlags(disable_cpu_flags_); \ FMT_ATOB((TYPE_A*)dst_argb_c, kStrideA, (TYPE_A*)dst_argb_c, kStrideA, \ kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \ kWidth, NEG kHeight); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #if defined(ENABLE_FULL_TESTS) #define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ + 1, \ _Any, +, 0) \ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ _Unaligned, +, 
2) \ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ _Opt, +, 0) #else #define TESTEND(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ TESTENDI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ _Opt, +, 0) #endif TESTEND(ARGBToBGRA, uint8_t, 4, 4, 1) TESTEND(ARGBToABGR, uint8_t, 4, 4, 1) TESTEND(BGRAToARGB, uint8_t, 4, 4, 1) TESTEND(ABGRToARGB, uint8_t, 4, 4, 1) TESTEND(AB64ToAR64, uint16_t, 4, 4, 1) #ifdef HAVE_JPEG TEST_F(LibYUVConvertTest, ValidateJpeg) { const int kOff = 10; const int kMinJpeg = 64; const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? benchmark_width_ * benchmark_height_ : kMinJpeg; const int kSize = kImageSize + kOff; align_buffer_page_end(orig_pixels, kSize); // No SOI or EOI. Expect fail. memset(orig_pixels, 0, kSize); EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); // Test special value that matches marker start. memset(orig_pixels, 0xff, kSize); EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); // EOI, SOI. Expect pass. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. orig_pixels[2] = 0xff; orig_pixels[kSize - kOff + 0] = 0xff; orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. for (int times = 0; times < benchmark_iterations_; ++times) { EXPECT_TRUE(ValidateJpeg(orig_pixels, kSize)); } free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVConvertTest, ValidateJpegLarge) { const int kOff = 10; const int kMinJpeg = 64; const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? benchmark_width_ * benchmark_height_ : kMinJpeg; const int kSize = kImageSize + kOff; const int kMultiple = 10; const int kBufSize = kImageSize * kMultiple + kOff; align_buffer_page_end(orig_pixels, kBufSize); // No SOI or EOI. Expect fail. memset(orig_pixels, 0, kBufSize); EXPECT_FALSE(ValidateJpeg(orig_pixels, kBufSize)); // EOI, SOI. Expect pass. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. 
orig_pixels[2] = 0xff; orig_pixels[kSize - kOff + 0] = 0xff; orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. for (int times = 0; times < benchmark_iterations_; ++times) { EXPECT_TRUE(ValidateJpeg(orig_pixels, kBufSize)); } free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVConvertTest, InvalidateJpeg) { const int kOff = 10; const int kMinJpeg = 64; const int kImageSize = benchmark_width_ * benchmark_height_ >= kMinJpeg ? benchmark_width_ * benchmark_height_ : kMinJpeg; const int kSize = kImageSize + kOff; align_buffer_page_end(orig_pixels, kSize); // NULL pointer. Expect fail. EXPECT_FALSE(ValidateJpeg(NULL, kSize)); // Negative size. Expect fail. EXPECT_FALSE(ValidateJpeg(orig_pixels, -1)); // Too large size. Expect fail. EXPECT_FALSE(ValidateJpeg(orig_pixels, 0xfb000000ull)); // No SOI or EOI. Expect fail. memset(orig_pixels, 0, kSize); EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); // SOI but no EOI. Expect fail. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. orig_pixels[2] = 0xff; for (int times = 0; times < benchmark_iterations_; ++times) { EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); } // EOI but no SOI. Expect fail. orig_pixels[0] = 0; orig_pixels[1] = 0; orig_pixels[kSize - kOff + 0] = 0xff; orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVConvertTest, FuzzJpeg) { // SOI but no EOI. Expect fail. for (int times = 0; times < benchmark_iterations_; ++times) { const int kSize = fastrand() % 5000 + 3; align_buffer_page_end(orig_pixels, kSize); MemRandomize(orig_pixels, kSize); // Add SOI so frame will be scanned. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. orig_pixels[2] = 0xff; orig_pixels[kSize - 1] = 0xff; ValidateJpeg(orig_pixels, kSize); // Failure normally expected. free_aligned_buffer_page_end(orig_pixels); } } // Test data created in GIMP. 
In export jpeg, disable // thumbnails etc, choose a subsampling, and use low quality // (50) to keep size small. Generated with xxd -i test.jpg // test 0 is J400 static const uint8_t kTest0Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xc2, 0x00, 0x0b, 0x08, 0x00, 0x10, 0x00, 0x20, 0x01, 0x01, 0x11, 0x00, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x00, 0x01, 0x43, 0x7e, 0xa7, 0x97, 0x57, 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, 0x3b, 0xc0, 0x6f, 0x66, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x65, 0x6e, 0x31, 0x86, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 
0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x00, 0x10, 0x35, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x0b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x88, 0xab, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; static const size_t kTest0JpgLen = 421; // test 1 is J444 static const uint8_t kTest1Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, 0x01, 0x11, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x01, 0x03, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01, 0x40, 0x8f, 0x26, 0xe8, 0xf4, 0xcc, 0xf9, 0x69, 0x2b, 0x1b, 0x2a, 0xcb, 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x01, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x10, 0x11, 0x02, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xf1, 0x00, 0x27, 0x45, 0xbb, 0x31, 0xaf, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x02, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x10, 0x11, 0x41, 0x12, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xf6, 0x4b, 0x5f, 0x48, 0xb3, 0x69, 0x63, 0x35, 0x72, 0xbf, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 
0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x26, 0x61, 0xd4, 0xff, 0xc4, 0x00, 0x1a, 0x11, 0x00, 0x03, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x41, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x54, 0xa8, 0xbf, 0x50, 0x87, 0xb0, 0x9d, 0x8b, 0xc4, 0x6a, 0x26, 0x6b, 0x2a, 0x9c, 0x1f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x21, 0x51, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x70, 0xe1, 0x3e, 0xd1, 0x8e, 0x0d, 0xe1, 0xb5, 0xd5, 0x91, 0x76, 0x43, 0x82, 0x45, 0x4c, 0x7b, 0x7f, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; static const size_t kTest1JpgLen = 735; // test 2 is J420 static const uint8_t kTest2Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, 0x63, 0x63, 0x63, 
0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, 0x01, 0x22, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x05, 0x01, 0x02, 0x04, 0xff, 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x01, 0x02, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01, 0x20, 0xe7, 0x28, 0xa3, 0x0b, 0x2e, 0x2d, 0xcf, 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0xc8, 0x53, 0xff, 0xc4, 0x00, 0x16, 0x11, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x32, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xd2, 0xc7, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x13, 0x5f, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x0e, 0xa1, 0x3a, 0x76, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x21, 0x11, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x57, 0x0b, 0x08, 0x70, 0xdb, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; static const size_t kTest2JpgLen = 685; // test 3 is J422 static const uint8_t kTest3Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, 0x57, 0x5f, 0x62, 0x67, 
0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, 0x01, 0x21, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, 0x17, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x01, 0x02, 0xff, 0xc4, 0x00, 0x17, 0x01, 0x00, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x00, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01, 0x43, 0x8d, 0x1f, 0xa2, 0xb3, 0xca, 0x1b, 0x57, 0x0f, 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x10, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0x51, 0xce, 0x8c, 0x75, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x61, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xa6, 0xd9, 0x2f, 0x84, 0xe8, 0xf0, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 
0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x2e, 0x45, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x53, 0x50, 0xba, 0x54, 0xc1, 0x67, 0x4f, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x00, 0x10, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0x18, 0x81, 0x5c, 0x04, 0x1a, 0xca, 0x91, 0xbf, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03, 0x0b, 0xb7, 0xd4, 0xff, 0xd9}; static const size_t kTest3JpgLen = 704; // test 4 is J422 vertical - not supported static const uint8_t kTest4Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x01, 0x00, 0x48, 0x00, 0x48, 
0x00, 0x00, 0xff, 0xdb, 0x00, 0x43, 0x00, 0x10, 0x0b, 0x0c, 0x0e, 0x0c, 0x0a, 0x10, 0x0e, 0x0d, 0x0e, 0x12, 0x11, 0x10, 0x13, 0x18, 0x28, 0x1a, 0x18, 0x16, 0x16, 0x18, 0x31, 0x23, 0x25, 0x1d, 0x28, 0x3a, 0x33, 0x3d, 0x3c, 0x39, 0x33, 0x38, 0x37, 0x40, 0x48, 0x5c, 0x4e, 0x40, 0x44, 0x57, 0x45, 0x37, 0x38, 0x50, 0x6d, 0x51, 0x57, 0x5f, 0x62, 0x67, 0x68, 0x67, 0x3e, 0x4d, 0x71, 0x79, 0x70, 0x64, 0x78, 0x5c, 0x65, 0x67, 0x63, 0xff, 0xdb, 0x00, 0x43, 0x01, 0x11, 0x12, 0x12, 0x18, 0x15, 0x18, 0x2f, 0x1a, 0x1a, 0x2f, 0x63, 0x42, 0x38, 0x42, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0xff, 0xc2, 0x00, 0x11, 0x08, 0x00, 0x10, 0x00, 0x20, 0x03, 0x01, 0x12, 0x00, 0x02, 0x11, 0x01, 0x03, 0x11, 0x01, 0xff, 0xc4, 0x00, 0x18, 0x00, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x05, 0x01, 0x02, 0x03, 0xff, 0xc4, 0x00, 0x16, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x03, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x10, 0x03, 0x10, 0x00, 0x00, 0x01, 0xd2, 0x98, 0xe9, 0x03, 0x0c, 0x00, 0x46, 0x21, 0xd9, 0xff, 0xc4, 0x00, 0x1b, 0x10, 0x00, 0x03, 0x00, 0x02, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x11, 0x00, 0x03, 0x10, 0x12, 0x13, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x05, 0x02, 0x3b, 0x80, 0x6f, 0x56, 0x76, 0x56, 0x23, 0x87, 0x99, 0x0d, 0x26, 0x62, 0xf6, 0xbf, 0xff, 0xc4, 0x00, 0x17, 0x11, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x01, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x01, 0x98, 0xb1, 0xbd, 0x47, 0xff, 0xc4, 0x00, 0x18, 0x11, 0x00, 0x03, 0x01, 0x01, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x12, 0x11, 0x21, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x01, 0xb6, 0x35, 0xa2, 0xe1, 0x47, 0xff, 0xc4, 0x00, 0x1e, 0x10, 0x00, 0x02, 0x01, 0x03, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x21, 0x02, 0x12, 0x32, 0x10, 0x31, 0x71, 0x81, 0xa1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x06, 0x3f, 0x02, 0x4b, 0xb3, 0x28, 0x32, 0xd2, 0xed, 0xf9, 0x1d, 0x3e, 0x13, 0x51, 0x73, 0x83, 0xff, 0xc4, 0x00, 0x1c, 0x10, 0x01, 0x01, 0x01, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x00, 0x21, 0x51, 0x31, 0x61, 0x81, 0xf0, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x21, 0x75, 0x6e, 0x31, 0x94, 0x28, 0xf9, 0x30, 0xdc, 0x27, 0xdb, 0xa9, 0x01, 0xf3, 0xde, 0x02, 0xa0, 0xed, 0x1e, 0x34, 0x68, 0x23, 0xf9, 0xc6, 0x48, 0x5d, 0x7a, 0x35, 0x02, 0xf5, 0x6f, 0xff, 0xda, 0x00, 0x0c, 0x03, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x00, 0x00, 0x10, 0x24, 0xaf, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x03, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x51, 0x21, 0x31, 0xff, 0xda, 0x00, 0x08, 0x01, 0x03, 0x01, 0x01, 0x3f, 0x10, 0x59, 0x11, 0xca, 0x42, 0x60, 0x9f, 0x69, 0xff, 0xc4, 0x00, 0x19, 0x11, 0x00, 0x02, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x11, 0x21, 0x31, 0x61, 0xff, 0xda, 0x00, 0x08, 0x01, 0x02, 0x01, 0x01, 0x3f, 0x10, 0xb0, 0xd7, 0x27, 0x51, 0xb6, 0x41, 0xff, 0xc4, 0x00, 0x1f, 0x10, 0x01, 0x00, 0x02, 0x01, 0x04, 0x03, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x11, 0x31, 0x41, 0x61, 0x71, 0x91, 0x21, 0x81, 0xd1, 0xb1, 0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x01, 0x3f, 0x10, 0x1b, 0x30, 0xe9, 0x58, 0xbe, 0x1a, 0xfd, 0x8a, 0xeb, 0x8b, 0x34, 0x74, 0x80, 0x4b, 0xb5, 0xd5, 0xab, 0xcd, 0x46, 0x96, 0x2e, 0xec, 0xbd, 0xaa, 0x78, 0x47, 0x5c, 0x47, 0xa7, 0x30, 
0x49, 0xad, 0x88, 0x7c, 0x40, 0x74, 0x30, 0xff, 0x00, 0x23, 0x1d, 0x03,
    0x0b, 0xb7, 0xd4, 0xff, 0xd9};
static const size_t kTest4JpgLen = 701;

// MJPGSize must succeed on the J420 sample; the reported size is printed.
TEST_F(LibYUVConvertTest, TestMJPGSize) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
  EXPECT_EQ(0, ret);

  printf("test jpeg size %d x %d\n", width, height);
}

// Decodes the J420 sample to I420 and checks each plane against a known
// HashDjb2 value.
TEST_F(LibYUVConvertTest, TestMJPGToI420) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
  // Fatal on failure: width/height start at 0 and the iteration count below
  // divides by width * height.
  ASSERT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);
  // Always decode at least once so the checks below never read buffers the
  // decoder did not write.
  if (benchmark_iterations < 1) {
    benchmark_iterations = 1;
  }

  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_u, half_width * half_height);
  align_buffer_page_end(dst_v, half_width * half_height);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_u, half_width,
                     dst_v, half_width, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Test result matches known hash value.
  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
  uint32_t dst_u_hash = HashDjb2(dst_u, half_width * half_height, 5381);
  uint32_t dst_v_hash = HashDjb2(dst_v, half_width * half_height, 5381);
  EXPECT_EQ(dst_y_hash, 2682851208u);
  EXPECT_EQ(dst_u_hash, 2501859930u);
  EXPECT_EQ(dst_v_hash, 2126459123u);

  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_u);
  free_aligned_buffer_page_end(dst_v);
}

// Decodes the J420 sample directly to NV21 and, separately, to I420 followed
// by I420ToNV21; both routes must produce identical Y and VU planes.
TEST_F(LibYUVConvertTest, TestMJPGToI420_NV21) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
  // Fatal on failure: width/height start at 0 and the iteration count below
  // divides by width * height.
  ASSERT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);
  // Always decode at least once so the comparison below never reads buffers
  // the decoder did not write.
  if (benchmark_iterations < 1) {
    benchmark_iterations = 1;
  }

  // Convert to NV21
  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_vu, half_width * half_height * 2);

  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_vu,
                     half_width * 2, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Convert to I420
  align_buffer_page_end(dst2_y, width * height);
  align_buffer_page_end(dst2_u, half_width * half_height);
  align_buffer_page_end(dst2_v, half_width * half_height);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
                     dst2_v, half_width, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Convert I420 to NV21
  align_buffer_page_end(dst3_y, width * height);
  align_buffer_page_end(dst3_vu, half_width * half_height * 2);

  I420ToNV21(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
             width, dst3_vu, half_width * 2, width, height);

  for (int i = 0; i < width * height; ++i) {
    EXPECT_EQ(dst_y[i], dst3_y[i]);
  }
  for (int i = 0; i < half_width * half_height * 2; ++i) {
    EXPECT_EQ(dst_vu[i], dst3_vu[i]);
  }

  free_aligned_buffer_page_end(dst3_y);
  free_aligned_buffer_page_end(dst3_vu);

  free_aligned_buffer_page_end(dst2_y);
  free_aligned_buffer_page_end(dst2_u);
  free_aligned_buffer_page_end(dst2_v);

  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_vu);
}

// Decodes the J420 sample directly to NV12 and, separately, to I420 followed
// by I420ToNV12; both routes must produce identical Y and UV planes.
TEST_F(LibYUVConvertTest, TestMJPGToI420_NV12) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
  // Fatal on failure: width/height start at 0 and the iteration count below
  // divides by width * height.
  ASSERT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);
  // Always decode at least once so the comparison below never reads buffers
  // the decoder did not write.
  if (benchmark_iterations < 1) {
    benchmark_iterations = 1;
  }

  // Convert to NV12
  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_uv, half_width * half_height * 2);

  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
                     half_width * 2, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Convert to I420
  align_buffer_page_end(dst2_y, width * height);
  align_buffer_page_end(dst2_u, half_width * half_height);
  align_buffer_page_end(dst2_v, half_width * half_height);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToI420(kTest2Jpg, kTest2JpgLen, dst2_y, width, dst2_u, half_width,
                     dst2_v, half_width, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Convert I420 to NV12
  align_buffer_page_end(dst3_y, width * height);
  align_buffer_page_end(dst3_uv, half_width * half_height * 2);

  I420ToNV12(dst2_y, width, dst2_u, half_width, dst2_v, half_width, dst3_y,
             width, dst3_uv, half_width * 2, width, height);

  for (int i = 0; i < width * height; ++i) {
    EXPECT_EQ(dst_y[i], dst3_y[i]);
  }
  for (int i = 0; i < half_width * half_height * 2; ++i) {
    EXPECT_EQ(dst_uv[i], dst3_uv[i]);
  }

  free_aligned_buffer_page_end(dst3_y);
  free_aligned_buffer_page_end(dst3_uv);

  free_aligned_buffer_page_end(dst2_y);
  free_aligned_buffer_page_end(dst2_u);
  free_aligned_buffer_page_end(dst2_v);

  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_uv);
}

// Decodes the J420 sample to NV21 and checks the Y plane and the interleaved
// chroma plane against known HashDjb2 values.
TEST_F(LibYUVConvertTest, TestMJPGToNV21_420) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
  // Fatal on failure: width/height start at 0 and the iteration count below
  // divides by width * height.
  ASSERT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);
  // Always decode at least once so the hashes below never cover buffers the
  // decoder did not write.
  if (benchmark_iterations < 1) {
    benchmark_iterations = 1;
  }

  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_uv, half_width * half_height * 2);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToNV21(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
                     half_width * 2, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Test result matches known hash value.
  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
  uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
  EXPECT_EQ(dst_y_hash, 2682851208u);
  EXPECT_EQ(dst_uv_hash, 1069662856u);

  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_uv);
}

// Decodes the J420 sample to NV12, swaps the chroma plane to VU order with
// SwapUVPlane, and checks it against the same hashes as the NV21 test.
TEST_F(LibYUVConvertTest, TestMJPGToNV12_420) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest2Jpg, kTest2JpgLen, &width, &height);
  // Fatal on failure: width/height start at 0 and the iteration count below
  // divides by width * height.
  ASSERT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);
  // Always decode at least once so the hashes below never cover buffers the
  // decoder did not write.
  if (benchmark_iterations < 1) {
    benchmark_iterations = 1;
  }

  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_uv, half_width * half_height * 2);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToNV12(kTest2Jpg, kTest2JpgLen, dst_y, width, dst_uv,
                     half_width * 2, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Test result matches known hash value. Hashes are for VU so flip the plane.
  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
  align_buffer_page_end(dst_vu, half_width * half_height * 2);
  SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width,
              half_height);
  uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381);
  EXPECT_EQ(dst_y_hash, 2682851208u);
  EXPECT_EQ(dst_vu_hash, 1069662856u);

  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_uv);
  free_aligned_buffer_page_end(dst_vu);
}

// Decodes the J422 sample to NV21 and checks the Y plane and the interleaved
// chroma plane against known HashDjb2 values.
TEST_F(LibYUVConvertTest, TestMJPGToNV21_422) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
  // Fatal on failure: width/height start at 0 and the iteration count below
  // divides by width * height.
  ASSERT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);
  // Always decode at least once so the hashes below never cover buffers the
  // decoder did not write.
  if (benchmark_iterations < 1) {
    benchmark_iterations = 1;
  }

  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_uv, half_width * half_height * 2);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToNV21(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
                     half_width * 2, width, height, width, height);
  }
  // Expect success
  EXPECT_EQ(0, ret);

  // Test result matches known hash value.
  uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381);
  uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381);
  EXPECT_EQ(dst_y_hash, 2682851208u);
  EXPECT_EQ(dst_uv_hash, 3543430771u);

  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_uv);
}

TEST_F(LibYUVConvertTest, TestMJPGToNV12_422) {
  int width = 0;
  int height = 0;
  int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height);
  EXPECT_EQ(0, ret);

  int half_width = (width + 1) / 2;
  int half_height = (height + 1) / 2;
  int benchmark_iterations = benchmark_iterations_ * benchmark_width_ *
                             benchmark_height_ / (width * height);

  align_buffer_page_end(dst_y, width * height);
  align_buffer_page_end(dst_uv, half_width * half_height * 2);
  for (int times = 0; times < benchmark_iterations; ++times) {
    ret = MJPGToNV12(kTest3Jpg, kTest3JpgLen, dst_y, width, dst_uv,
                     half_width * 2, width, height, width, height);
  }
  // Expect sucesss
  EXPECT_EQ(0, ret);

  // Test result matches known hash value. Hashes are for VU so flip the plane.
uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); align_buffer_page_end(dst_vu, half_width * half_height * 2); SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, half_height); uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); EXPECT_EQ(dst_y_hash, 2682851208u); EXPECT_EQ(dst_vu_hash, 3543430771u); free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_uv); free_aligned_buffer_page_end(dst_vu); } TEST_F(LibYUVConvertTest, TestMJPGToNV21_400) { int width = 0; int height = 0; int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height); EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); for (int times = 0; times < benchmark_iterations; ++times) { ret = MJPGToNV21(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv, half_width * 2, width, height, width, height); } // Expect sucesss EXPECT_EQ(0, ret); // Test result matches known hash value. 
uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); EXPECT_EQ(dst_y_hash, 330644005u); EXPECT_EQ(dst_uv_hash, 135214341u); free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_uv); } TEST_F(LibYUVConvertTest, TestMJPGToNV12_400) { int width = 0; int height = 0; int ret = MJPGSize(kTest0Jpg, kTest0JpgLen, &width, &height); EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); for (int times = 0; times < benchmark_iterations; ++times) { ret = MJPGToNV12(kTest0Jpg, kTest0JpgLen, dst_y, width, dst_uv, half_width * 2, width, height, width, height); } // Expect sucesss EXPECT_EQ(0, ret); // Test result matches known hash value. Hashes are for VU so flip the plane. 
uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); align_buffer_page_end(dst_vu, half_width * half_height * 2); SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, half_height); uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); EXPECT_EQ(dst_y_hash, 330644005u); EXPECT_EQ(dst_vu_hash, 135214341u); free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_uv); free_aligned_buffer_page_end(dst_vu); } TEST_F(LibYUVConvertTest, TestMJPGToNV21_444) { int width = 0; int height = 0; int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height); EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); for (int times = 0; times < benchmark_iterations; ++times) { ret = MJPGToNV21(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv, half_width * 2, width, height, width, height); } // Expect sucesss EXPECT_EQ(0, ret); // Test result matches known hash value. 
uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); uint32_t dst_uv_hash = HashDjb2(dst_uv, half_width * half_height * 2, 5381); EXPECT_EQ(dst_y_hash, 2682851208u); EXPECT_EQ(dst_uv_hash, 506143297u); free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_uv); } TEST_F(LibYUVConvertTest, TestMJPGToNV12_444) { int width = 0; int height = 0; int ret = MJPGSize(kTest1Jpg, kTest1JpgLen, &width, &height); EXPECT_EQ(0, ret); int half_width = (width + 1) / 2; int half_height = (height + 1) / 2; int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); align_buffer_page_end(dst_y, width * height); align_buffer_page_end(dst_uv, half_width * half_height * 2); for (int times = 0; times < benchmark_iterations; ++times) { ret = MJPGToNV12(kTest1Jpg, kTest1JpgLen, dst_y, width, dst_uv, half_width * 2, width, height, width, height); } // Expect sucesss EXPECT_EQ(0, ret); // Test result matches known hash value. Hashes are for VU so flip the plane. 
uint32_t dst_y_hash = HashDjb2(dst_y, width * height, 5381); align_buffer_page_end(dst_vu, half_width * half_height * 2); SwapUVPlane(dst_uv, half_width * 2, dst_vu, half_width * 2, half_width, half_height); uint32_t dst_vu_hash = HashDjb2(dst_vu, half_width * half_height * 2, 5381); EXPECT_EQ(dst_y_hash, 2682851208u); EXPECT_EQ(dst_vu_hash, 506143297u); free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_uv); free_aligned_buffer_page_end(dst_vu); } TEST_F(LibYUVConvertTest, TestMJPGToARGB) { int width = 0; int height = 0; int ret = MJPGSize(kTest3Jpg, kTest3JpgLen, &width, &height); EXPECT_EQ(0, ret); int benchmark_iterations = benchmark_iterations_ * benchmark_width_ * benchmark_height_ / (width * height); align_buffer_page_end(dst_argb, width * height * 4); for (int times = 0; times < benchmark_iterations; ++times) { ret = MJPGToARGB(kTest3Jpg, kTest3JpgLen, dst_argb, width * 4, width, height, width, height); } // Expect sucesss EXPECT_EQ(0, ret); // Test result matches known hash value. 
uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381); #ifdef LIBYUV_UNLIMITED_DATA EXPECT_EQ(dst_argb_hash, 3900633302u); #else EXPECT_EQ(dst_argb_hash, 2355976473u); #endif free_aligned_buffer_page_end(dst_argb); } static int ShowJPegInfo(const uint8_t* sample, size_t sample_size) { MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); int width = mjpeg_decoder.GetWidth(); int height = mjpeg_decoder.GetHeight(); // YUV420 if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && mjpeg_decoder.GetVertSampFactor(1) == 1 && mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { printf("JPeg is J420, %dx%d %d bytes\n", width, height, static_cast(sample_size)); // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && mjpeg_decoder.GetVertSampFactor(1) == 1 && mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { printf("JPeg is J422, %dx%d %d bytes\n", width, height, static_cast(sample_size)); // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1 && mjpeg_decoder.GetVertSampFactor(1) == 1 && mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { printf("JPeg is J444, %dx%d %d bytes\n", width, height, static_cast(sample_size)); // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && 
mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { printf("JPeg is J400, %dx%d %d bytes\n", width, height, static_cast(sample_size)); } else { // Unknown colorspace. printf("JPeg is Unknown colorspace.\n"); } mjpeg_decoder.UnloadFrame(); return ret; } TEST_F(LibYUVConvertTest, TestMJPGInfo) { EXPECT_EQ(1, ShowJPegInfo(kTest0Jpg, kTest0JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, kTest4JpgLen)); // Valid but unsupported. } #endif // HAVE_JPEG TEST_F(LibYUVConvertTest, NV12Crop) { const int SUBSAMP_X = 2; const int SUBSAMP_Y = 2; const int kWidth = benchmark_width_; const int kHeight = benchmark_height_; const int crop_y = ((benchmark_height_ - (benchmark_height_ * 360 / 480)) / 2 + 1) & ~1; const int kDestWidth = benchmark_width_; const int kDestHeight = benchmark_height_ - crop_y * 2; const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); const int sample_size = kWidth * kHeight + kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; align_buffer_page_end(src_y, sample_size); uint8_t* src_uv = src_y + kWidth * kHeight; align_buffer_page_end(dst_y, kDestWidth * kDestHeight); align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight); align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); for (int i = 0; i < kHeight * kWidth; ++i) { src_y[i] = (fastrand() & 0xff); } for (int i = 0; i < (SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideUV) * 2; ++i) { src_uv[i] = (fastrand() & 0xff); } memset(dst_y, 1, kDestWidth 
* kDestHeight); memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); memset(dst_v, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); memset(dst_y_2, 1, kDestWidth * kDestHeight); memset(dst_u_2, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); memset(dst_v_2, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); ConvertToI420(src_y, sample_size, dst_y_2, kDestWidth, dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight, kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_NV12); NV12ToI420(src_y + crop_y * kWidth, kWidth, src_uv + (crop_y / 2) * kStrideUV * 2, kStrideUV * 2, dst_y, kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X), kDestWidth, kDestHeight); for (int i = 0; i < kDestHeight; ++i) { for (int j = 0; j < kDestWidth; ++j) { EXPECT_EQ(dst_y[i * kWidth + j], dst_y_2[i * kWidth + j]); } } for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { EXPECT_EQ(dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j], dst_u_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); } } for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { EXPECT_EQ(dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j], dst_v_2[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); } } free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_u); free_aligned_buffer_page_end(dst_v); free_aligned_buffer_page_end(dst_y_2); free_aligned_buffer_page_end(dst_u_2); free_aligned_buffer_page_end(dst_v_2); free_aligned_buffer_page_end(src_y); } TEST_F(LibYUVConvertTest, I420CropOddY) { const int SUBSAMP_X = 2; const int SUBSAMP_Y = 2; const int kWidth = benchmark_width_; const int kHeight = benchmark_height_; const int crop_y = 1; const int kDestWidth = benchmark_width_; const 
int kDestHeight = benchmark_height_ - crop_y * 2; const int kStrideU = SUBSAMPLE(kWidth, SUBSAMP_X); const int kStrideV = SUBSAMPLE(kWidth, SUBSAMP_X); const int sample_size = kWidth * kHeight + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y) + kStrideV * SUBSAMPLE(kHeight, SUBSAMP_Y); align_buffer_page_end(src_y, sample_size); uint8_t* src_u = src_y + kWidth * kHeight; uint8_t* src_v = src_u + kStrideU * SUBSAMPLE(kHeight, SUBSAMP_Y); align_buffer_page_end(dst_y, kDestWidth * kDestHeight); align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); for (int i = 0; i < kHeight * kWidth; ++i) { src_y[i] = (fastrand() & 0xff); } for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideU; ++i) { src_u[i] = (fastrand() & 0xff); } for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y) * kStrideV; ++i) { src_v[i] = (fastrand() & 0xff); } memset(dst_y, 1, kDestWidth * kDestHeight); memset(dst_u, 2, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); memset(dst_v, 3, SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y)); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { ConvertToI420(src_y, sample_size, dst_y, kDestWidth, dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X), dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X), 0, crop_y, kWidth, kHeight, kDestWidth, kDestHeight, libyuv::kRotate0, libyuv::FOURCC_I420); } for (int i = 0; i < kDestHeight; ++i) { for (int j = 0; j < kDestWidth; ++j) { EXPECT_EQ(src_y[crop_y * kWidth + i * kWidth + j], dst_y[i * kDestWidth + j]); } } for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { for (int j = 0; j < SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { EXPECT_EQ(src_u[(crop_y / 2 + i) * kStrideU + j], dst_u[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); } } for (int i = 0; i < SUBSAMPLE(kDestHeight, SUBSAMP_Y); ++i) { for (int j = 0; j < 
SUBSAMPLE(kDestWidth, SUBSAMP_X); ++j) { EXPECT_EQ(src_v[(crop_y / 2 + i) * kStrideV + j], dst_v[i * SUBSAMPLE(kDestWidth, SUBSAMP_X) + j]); } } free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_u); free_aligned_buffer_page_end(dst_v); free_aligned_buffer_page_end(src_y); } TEST_F(LibYUVConvertTest, TestYToARGB) { uint8_t y[32]; uint8_t expectedg[32]; for (int i = 0; i < 32; ++i) { y[i] = i * 5 + 17; expectedg[i] = static_cast((y[i] - 16) * 1.164f + 0.5f); } uint8_t argb[32 * 4]; YToARGB(y, 0, argb, 0, 32, 1); for (int i = 0; i < 32; ++i) { printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i], argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]); } for (int i = 0; i < 32; ++i) { EXPECT_EQ(expectedg[i], argb[i * 4 + 0]); } } static const uint8_t kNoDither4x4[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; TEST_F(LibYUVConvertTest, TestNoDither) { align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4); align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2); align_buffer_page_end(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2); MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4); MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2); MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2); ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2, benchmark_width_, benchmark_height_); ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither, benchmark_width_ * 2, kNoDither4x4, benchmark_width_, benchmark_height_); for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) { EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]); } free_aligned_buffer_page_end(src_argb); free_aligned_buffer_page_end(dst_rgb565); free_aligned_buffer_page_end(dst_rgb565dither); } // Ordered 4x4 dither for 888 to 565. Values from 0 to 7. 
static const uint8_t kDither565_4x4[16] = {
    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
};

// Converts the same random ARGB image to RGB565 with and without the 4x4
// dither table, expands both results back to ARGB, and expects every byte of
// the two ARGB images to agree within 9.
TEST_F(LibYUVConvertTest, TestDither) {
  align_buffer_page_end(src_argb, benchmark_width_ * benchmark_height_ * 4);
  align_buffer_page_end(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
  align_buffer_page_end(dst_rgb565dither,
                        benchmark_width_ * benchmark_height_ * 2);
  align_buffer_page_end(dst_argb, benchmark_width_ * benchmark_height_ * 4);
  align_buffer_page_end(dst_argbdither,
                        benchmark_width_ * benchmark_height_ * 4);
  MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4);
  MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2);
  MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2);
  MemRandomize(dst_argb, benchmark_width_ * benchmark_height_ * 4);
  MemRandomize(dst_argbdither, benchmark_width_ * benchmark_height_ * 4);
  ARGBToRGB565(src_argb, benchmark_width_ * 4, dst_rgb565, benchmark_width_ * 2,
               benchmark_width_, benchmark_height_);
  ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, dst_rgb565dither,
                     benchmark_width_ * 2, kDither565_4x4, benchmark_width_,
                     benchmark_height_);
  RGB565ToARGB(dst_rgb565, benchmark_width_ * 2, dst_argb, benchmark_width_ * 4,
               benchmark_width_, benchmark_height_);
  RGB565ToARGB(dst_rgb565dither, benchmark_width_ * 2, dst_argbdither,
               benchmark_width_ * 4, benchmark_width_, benchmark_height_);

  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
    EXPECT_NEAR(dst_argb[i], dst_argbdither[i], 9);
  }
  free_aligned_buffer_page_end(src_argb);
  free_aligned_buffer_page_end(dst_rgb565);
  free_aligned_buffer_page_end(dst_rgb565dither);
  free_aligned_buffer_page_end(dst_argb);
  free_aligned_buffer_page_end(dst_argbdither);
}

// Generates one test that runs FMT_PLANAR##To##FMT_B##Dither twice on the same
// random planar input: once after MaskCpuFlags(disable_cpu_flags_) and then
// benchmark_iterations_ times after MaskCpuFlags(benchmark_cpu_info_).
// Both outputs are expanded with FMT_B##To##FMT_C and must match byte for
// byte.
//   SUBSAMP_X/Y   chroma subsampling divisors of the planar source.
//   BPP_B, ALIGN  bytes per pixel and stride alignment of format B.
//   YALIGN        alignment applied to benchmark_height_.
//   W1280, N      image width and test-name suffix.
//   NEG           sign token placed in front of the height argument.
//   OFF           byte offset added to every source and format-B pointer.
//   FMT_C, BPP_C  comparison format and its bytes per pixel.
// The dither table argument is always NULL.
#define TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                        YALIGN, W1280, N, NEG, OFF, FMT_C, BPP_C) \
  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##Dither##N) { \
    const int kWidth = W1280; \
    const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \
    const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
    align_buffer_page_end(src_u, kSizeUV + OFF); \
    align_buffer_page_end(src_v, kSizeUV + OFF); \
    align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \
    align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      src_y[i + OFF] = (fastrand() & 0xff); \
    } \
    for (int i = 0; i < kSizeUV; ++i) { \
      src_u[i + OFF] = (fastrand() & 0xff); \
      src_v[i + OFF] = (fastrand() & 0xff); \
    } \
    /* Different fill values so an unwritten output cannot match by chance. */ \
    memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
    memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
    MaskCpuFlags(disable_cpu_flags_); \
    FMT_PLANAR##To##FMT_B##Dither(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
                                  src_v + OFF, kStrideUV, dst_argb_c + OFF, \
                                  kStrideB, NULL, kWidth, NEG kHeight); \
    MaskCpuFlags(benchmark_cpu_info_); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      FMT_PLANAR##To##FMT_B##Dither( \
          src_y + OFF, kWidth, src_u + OFF, kStrideUV, src_v + OFF, kStrideUV, \
          dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \
    } \
    /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
    align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
    align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \
    memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \
    memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \
    FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \
                     kWidth, kHeight); \
    FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \
                     kWidth * BPP_C, kWidth, kHeight); \
    for (int i = 0; i < kWidth * BPP_C * kHeight; ++i) { \
      EXPECT_EQ(dst_argb32_c[i], dst_argb32_opt[i]); \
    } \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_u); \
    free_aligned_buffer_page_end(src_v); \
    free_aligned_buffer_page_end(dst_argb_c); \
    free_aligned_buffer_page_end(dst_argb_opt); \
    free_aligned_buffer_page_end(dst_argb32_c); \
    free_aligned_buffer_page_end(dst_argb32_opt); \
  }

// Instantiates TESTPLANARTOBID four times: _Any (width + 1), _Unaligned
// (pointer offset 2), _Invert (negated height) and _Opt.
#define TESTPLANARTOBD(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                       YALIGN, FMT_C, BPP_C) \
  TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
  TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C) \
  TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
  TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
                  YALIGN, benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)

#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4)
#endif

// Generates a test named NAME that converts a random packed 4:2:2 image to
// NV12 in one step (UYVYTONV12, run benchmark_iterations_ times) and in two
// steps (UYVYTOI420 followed by I420ToNV12), then expects identical Y and UV
// planes.
#define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \
  TEST_F(LibYUVConvertTest, NAME) { \
    const int kWidth = benchmark_width_; \
    const int kHeight = benchmark_height_; \
    \
    align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
    align_buffer_page_end(orig_y, kWidth* kHeight); \
    align_buffer_page_end(orig_u, \
                          SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
    align_buffer_page_end(orig_v, \
                          SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
    \
    align_buffer_page_end(dst_y_orig, kWidth* kHeight); \
    align_buffer_page_end(dst_uv_orig, \
                          2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
    \
    align_buffer_page_end(dst_y, kWidth* kHeight); \
    align_buffer_page_end(dst_uv, \
                          2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \
    \
    MemRandomize(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \
    \
    /* Convert UYVY to NV12 in 2 steps for reference */ \
    libyuv::UYVYTOI420(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), orig_y, kWidth, \
                       orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
                       SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
    libyuv::I420ToNV12(orig_y, kWidth, orig_u, SUBSAMPLE(kWidth, 2), orig_v, \
                       SUBSAMPLE(kWidth, 2), dst_y_orig, kWidth, dst_uv_orig, \
                       2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
    \
    /* Convert to NV12 */ \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      libyuv::UYVYTONV12(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2), dst_y, kWidth, \
                         dst_uv, 2 * SUBSAMPLE(kWidth, 2), kWidth, kHeight); \
    } \
    \
    /* Y is checked against both the intermediate I420 and reference NV12. */ \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      EXPECT_EQ(orig_y[i], dst_y[i]); \
    } \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      EXPECT_EQ(dst_y_orig[i], dst_y[i]); \
    } \
    for (int i = 0; i < 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2); \
         ++i) { \
      EXPECT_EQ(dst_uv_orig[i], dst_uv[i]); \
    } \
    \
    free_aligned_buffer_page_end(orig_uyvy); \
    free_aligned_buffer_page_end(orig_y); \
    free_aligned_buffer_page_end(orig_u); \
    free_aligned_buffer_page_end(orig_v); \
    free_aligned_buffer_page_end(dst_y_orig); \
    free_aligned_buffer_page_end(dst_uv_orig); \
    free_aligned_buffer_page_end(dst_y); \
    free_aligned_buffer_page_end(dst_uv); \
  }

TESTPTOB(TestYUY2ToNV12, YUY2ToI420, YUY2ToNV12)
TESTPTOB(TestUYVYToNV12, UYVYToI420, UYVYToNV12)

// Transitive test.  A to B to C is same as A to C.
// Benchmarks A To B to C for comparison to 1 step, benchmarked elsewhere.
// Generates one test that converts random planar YUV (FMT_PLANAR) to FMT_B
// once, then benchmark_iterations_ times converts FMT_PLANAR directly to FMT_C
// and the FMT_B image to FMT_C. The two FMT_C images must match byte for byte.
//   SUBSAMP_X/Y   chroma subsampling divisors of the planar source.
//   SUB_B, BPP_B  kStrideB is SUBSAMPLE(kWidth, SUB_B) * BPP_B.
//   W1280, N      image width and test-name suffix.
//   NEG           sign token placed in front of the height for the
//                 conversions that read the planar source.
//   OFF           byte offset added to every buffer pointer.
//   FMT_C, BPP_C  final format and its bytes per pixel.
#define TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                       W1280, N, NEG, OFF, FMT_C, BPP_C) \
  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \
    const int kWidth = W1280; \
    const int kHeight = benchmark_height_; \
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
    const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \
    const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \
    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
    align_buffer_page_end(src_u, kSizeUV + OFF); \
    align_buffer_page_end(src_v, kSizeUV + OFF); \
    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      src_y[i + OFF] = (fastrand() & 0xff); \
    } \
    for (int i = 0; i < kSizeUV; ++i) { \
      src_u[i + OFF] = (fastrand() & 0xff); \
      src_v[i + OFF] = (fastrand() & 0xff); \
    } \
    memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
    FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
                          src_v + OFF, kStrideUV, dst_argb_b + OFF, kStrideB, \
                          kWidth, NEG kHeight); \
    /* Convert to a 3rd format in 1 step and 2 steps and compare  */ \
    const int kStrideC = kWidth * BPP_C; \
    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
    /* Different fill values so an unwritten output cannot match by chance. */ \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      FMT_PLANAR##To##FMT_C(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
                            src_v + OFF, kStrideUV, dst_argb_c + OFF, \
                            kStrideC, kWidth, NEG kHeight); \
      /* Convert B to C */ \
      FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
                       kStrideC, kWidth, kHeight); \
    } \
    for (int i = 0; i < kStrideC * kHeight; ++i) { \
      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
    } \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_u); \
    free_aligned_buffer_page_end(src_v); \
    free_aligned_buffer_page_end(dst_argb_b); \
    free_aligned_buffer_page_end(dst_argb_c); \
    free_aligned_buffer_page_end(dst_argb_bc); \
  }

// With ENABLE_FULL_TESTS each TESTPLANARTOE expands to the _Any, _Unaligned,
// _Invert and _Opt variants; otherwise only _Opt is generated.
#if defined(ENABLE_FULL_TESTS)
#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                      FMT_C, BPP_C) \
  TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                 benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \
  TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                 benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C) \
  TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                 benchmark_width_, _Invert, -, 0, FMT_C, BPP_C) \
  TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                 benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
#else
#define TESTPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                      FMT_C, BPP_C) \
  TESTPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                 benchmark_width_, _Opt, +, 0, FMT_C, BPP_C)
#endif

// Format triples exercised by the transitive test. The full list is used with
// ENABLE_FULL_TESTS; a shorter list is used otherwise.
#if defined(ENABLE_FULL_TESTS)
TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3)
TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3)
TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RGB24, 3)
TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3)
TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, ARGB, 4)
TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3)
TESTPLANARTOE(J420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J420, 2, 2, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U420, 2, 2, ARGB, 1, 4, ARGB, 4)
#ifdef LITTLE_ENDIAN_ONLY_TEST
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
#endif
TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(J444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(J444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
#else
// NOTE(review): unlike the full list above, the RGB565 / ARGB1555 / ARGB4444
// cases in this branch are not wrapped in LITTLE_ENDIAN_ONLY_TEST - confirm
// that this is intended.
TESTPLANARTOE(I420, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RAW, 3)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB24, 3)
TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(I420, 2, 2, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RAW, 1, 3, RGB24, 3)
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, RGB24, 1, 3, RAW, 3)
TESTPLANARTOE(I420, 2, 2, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4)
#endif

// Transitive test: Compare 1 step vs 2 step conversion for YUVA to ARGB.
// Benchmark 2 step conversion for comparison to 1 step conversion.
// Same parameters as TESTPLANARTOEI, plus an alpha source plane and ATTEN,
// which is passed as the last argument of the planar-source conversions.
#define TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                        W1280, N, NEG, OFF, FMT_C, BPP_C, ATTEN) \
  TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##To##FMT_C##N) { \
    const int kWidth = W1280; \
    const int kHeight = benchmark_height_; \
    const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \
    const int kSizeUV = \
        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \
    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
    align_buffer_page_end(src_u, kSizeUV + OFF); \
    align_buffer_page_end(src_v, kSizeUV + OFF); \
    align_buffer_page_end(src_a, kWidth* kHeight + OFF); \
    align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \
    const int kStrideC = kWidth * BPP_C; \
    align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \
    align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \
    /* Different fill values so an unwritten output cannot match by chance. */ \
    memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \
    memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \
    memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      src_y[i + OFF] = (fastrand() & 0xff); \
      src_a[i + OFF] = (fastrand() & 0xff); \
    } \
    for (int i = 0; i < kSizeUV; ++i) { \
      src_u[i + OFF] = (fastrand() & 0xff); \
      src_v[i + OFF] = (fastrand() & 0xff); \
    } \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      /* Convert A to B */ \
      FMT_PLANAR##To##FMT_B( \
          src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
          src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
          dst_argb_b + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \
      /* Convert B to C */ \
      FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \
                       kStrideC, kWidth, kHeight); \
    } \
    /* Convert A to C */ \
    FMT_PLANAR##To##FMT_C( \
        src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), \
        src_v + OFF, SUBSAMPLE(kWidth, SUBSAMP_X), src_a + OFF, kWidth, \
        dst_argb_c + OFF, kStrideC, kWidth, NEG kHeight, ATTEN); \
    for (int i = 0; i < kStrideC * kHeight; ++i) { \
      EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_bc[i + OFF]); \
    } \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_u); \
    free_aligned_buffer_page_end(src_v); \
    free_aligned_buffer_page_end(src_a); \
    free_aligned_buffer_page_end(dst_argb_b); \
    free_aligned_buffer_page_end(dst_argb_c); \
    free_aligned_buffer_page_end(dst_argb_bc); \
  }

// With ENABLE_FULL_TESTS each TESTQPLANARTOE expands to the _Any, _Unaligned,
// _Invert, _Opt (ATTEN 0) and _Premult (ATTEN 1) variants; otherwise only
// _Opt is generated.
#if defined(ENABLE_FULL_TESTS)
#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                       FMT_C, BPP_C) \
  TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                  benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C, 0) \
  TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                  benchmark_width_, _Unaligned, +, 2, FMT_C, BPP_C, 0) \
  TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                  benchmark_width_, _Invert, -, 0, FMT_C, BPP_C, 0) \
  TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                  benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0) \
  TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                  benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1)
#else
#define TESTQPLANARTOE(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                       FMT_C, BPP_C) \
  TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \
                  benchmark_width_, _Opt, +, 0, FMT_C, BPP_C, 0)
#endif

#if defined(ENABLE_FULL_TESTS)
TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(J420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(H420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(H420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(J422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(F422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(F422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(J444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) #else TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) #endif #define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, 
\ OFF, FMT_C, BPP_C) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##To##FMT_C##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \ align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \ kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_A##To##FMT_C(src_argb_a + OFF, kStrideA, dst_argb_c + OFF, kStrideC, \ kWidth, NEG kHeight); \ /* Convert B to C */ \ FMT_B##To##FMT_C(dst_argb_b + OFF, kStrideB, dst_argb_bc + OFF, \ kStrideC, kWidth, kHeight); \ } \ for (int i = 0; i < kStrideC * kHeight; i += 4) { \ EXPECT_EQ(dst_argb_c[i + OFF + 0], dst_argb_bc[i + OFF + 0]); \ EXPECT_EQ(dst_argb_c[i + OFF + 1], dst_argb_bc[i + OFF + 1]); \ EXPECT_EQ(dst_argb_c[i + OFF + 2], dst_argb_bc[i + OFF + 2]); \ EXPECT_NEAR(dst_argb_c[i + OFF + 3], dst_argb_bc[i + OFF + 3], 64); \ } \ free_aligned_buffer_page_end(src_argb_a); \ free_aligned_buffer_page_end(dst_argb_b); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_bc); \ } #define TESTPLANETOE(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, \ benchmark_width_ + 1, _Any, +, 0, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ _Unaligned, +, 4, FMT_C, BPP_C) \ 
TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ _Invert, -, 0, FMT_C, BPP_C) \ TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, benchmark_width_, \ _Opt, +, 0, FMT_C, BPP_C) // Caveat: Destination needs to be 4 bytes #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4) TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4) TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4) TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4) TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4) TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4) TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4) TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4) #endif TEST_F(LibYUVConvertTest, RotateWithARGBSource) { // 2x2 frames uint32_t src[4]; uint32_t dst[4]; // some random input src[0] = 0x11000000; src[1] = 0x00450000; src[2] = 0x00009f00; src[3] = 0x000000ff; // zeros on destination dst[0] = 0x00000000; dst[1] = 0x00000000; dst[2] = 0x00000000; dst[3] = 0x00000000; int r = ConvertToARGB(reinterpret_cast(src), 16, // input size reinterpret_cast(dst), 8, // destination stride 0, // crop_x 0, // crop_y 2, // width 2, // height 2, // crop width 2, // crop height kRotate90, FOURCC_ARGB); EXPECT_EQ(r, 0); // 90 degrees rotation, no conversion EXPECT_EQ(dst[0], src[2]); EXPECT_EQ(dst[1], src[0]); EXPECT_EQ(dst[2], src[3]); EXPECT_EQ(dst[3], src[1]); } #ifdef HAS_ARGBTOAR30ROW_AVX2 TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { // ARGBToAR30Row_AVX2 expects a multiple of 8 pixels. 
const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7; align_buffer_page_end(src, kPixels * 4); align_buffer_page_end(dst_opt, kPixels * 4); align_buffer_page_end(dst_c, kPixels * 4); MemRandomize(src, kPixels * 4); memset(dst_opt, 0, kPixels * 4); memset(dst_c, 1, kPixels * 4); ARGBToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ARGBToAR30Row_AVX2(src, dst_opt, kPixels); } else if (has_ssse3) { ARGBToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ARGBToAR30Row_C(src, dst_opt, kPixels); } } for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_opt[i], dst_c[i]); } free_aligned_buffer_page_end(src); free_aligned_buffer_page_end(dst_opt); free_aligned_buffer_page_end(dst_c); } #endif // HAS_ARGBTOAR30ROW_AVX2 #ifdef HAS_ABGRTOAR30ROW_AVX2 TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels. const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7; align_buffer_page_end(src, kPixels * 4); align_buffer_page_end(dst_opt, kPixels * 4); align_buffer_page_end(dst_c, kPixels * 4); MemRandomize(src, kPixels * 4); memset(dst_opt, 0, kPixels * 4); memset(dst_c, 1, kPixels * 4); ABGRToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ABGRToAR30Row_AVX2(src, dst_opt, kPixels); } else if (has_ssse3) { ABGRToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ABGRToAR30Row_C(src, dst_opt, kPixels); } } for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_opt[i], dst_c[i]); } free_aligned_buffer_page_end(src); free_aligned_buffer_page_end(dst_opt); free_aligned_buffer_page_end(dst_c); } #endif // HAS_ABGRTOAR30ROW_AVX2 // Provide matrix wrappers for 12 bit YUV #define I012ToARGB(a, b, c, d, e, f, g, h, i, j) \ I012ToARGBMatrix(a, b, c, d, e, f, g, h, 
&kYuvI601Constants, i, j) #define I012ToAR30(a, b, c, d, e, f, g, h, i, j) \ I012ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToABGR(a, b, c, d, e, f, g, h, i, j) \ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define H410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) #define H410ToABGR(a, b, c, d, e, f, g, h, i, j) \ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) #define U410ToARGB(a, b, c, d, e, f, g, h, i, j) \ I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) #define U410ToABGR(a, b, c, d, e, f, g, h, i, j) \ I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) #define I410ToAR30(a, b, c, d, e, f, g, h, i, j) \ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define I410ToAB30(a, b, c, d, e, f, g, h, i, j) \ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) #define H410ToAR30(a, b, c, d, e, f, g, h, i, j) \ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) #define H410ToAB30(a, b, c, d, e, f, g, h, i, j) \ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) #define U410ToAR30(a, b, c, d, e, f, g, h, i, j) \ I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) #define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \ I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) // TODO(fbarchard): Fix clamping issue affected by U channel. 
#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ BPP_B, ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ const int kBpc = 2; \ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ reinterpret_cast(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \ } \ for (int i = 0; i < kSizeUV; ++i) { \ reinterpret_cast(src_u + SOFF)[i] = (fastrand() & FMT_MASK); \ reinterpret_cast(src_v + SOFF)[i] = (fastrand() & FMT_MASK); \ } \ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B( \ reinterpret_cast(src_y + SOFF), kWidth, \ reinterpret_cast(src_u + SOFF), kStrideUV, \ reinterpret_cast(src_v + SOFF), kStrideUV, \ dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B( \ reinterpret_cast(src_y + SOFF), kWidth, \ reinterpret_cast(src_u + SOFF), kStrideUV, \ reinterpret_cast(src_v + SOFF), kStrideUV, \ dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ } \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ } \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ free_aligned_buffer_page_end(dst_argb_c); \ 
free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, \ BPP_B, ALIGN, YALIGN) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ ALIGN, YALIGN, benchmark_width_ + 1, _Any, +, 0, 0) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ ALIGN, YALIGN, benchmark_width_, _Unaligned, +, 4, 4) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ ALIGN, YALIGN, benchmark_width_, _Invert, -, 0, 0) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_MASK, FMT_B, BPP_B, \ ALIGN, YALIGN, benchmark_width_, _Opt, +, 0, 0) // These conversions are only optimized for x86 #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(H010, 2, 2, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(U010, 2, 2, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(H210, 2, 1, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(U210, 2, 1, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(I410, 1, 1, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(H410, 1, 1, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ARGB, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, ABGR, 4, 4, 1) TESTPLANAR16TOB(I012, 2, 2, 0xfff, ARGB, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(I010, 2, 2, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(H010, 2, 2, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(U010, 2, 
2, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(U010, 2, 2, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(H210, 2, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(U210, 2, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(I410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(H410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AR30, 4, 4, 1) TESTPLANAR16TOB(U410, 1, 1, 0x3ff, AB30, 4, 4, 1) TESTPLANAR16TOB(I012, 2, 2, 0xfff, AR30, 4, 4, 1) #endif // LITTLE_ENDIAN_ONLY_TEST #endif // DISABLE_SLOW_TESTS #define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ ALIGN, YALIGN, W1280, N, NEG, OFF, ATTEN, S_DEPTH) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ const int kBpc = 2; \ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ reinterpret_cast(src_y + OFF)[i] = \ (fastrand() & ((1 << S_DEPTH) - 1)); \ reinterpret_cast(src_a + OFF)[i] = \ (fastrand() & ((1 << S_DEPTH) - 1)); \ } \ for (int i = 0; i < kSizeUV; ++i) { \ reinterpret_cast(src_u + OFF)[i] = \ (fastrand() & ((1 << S_DEPTH) - 1)); \ reinterpret_cast(src_v + OFF)[i] = \ (fastrand() & ((1 << S_DEPTH) - 
1)); \ } \ memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + OFF), kWidth, \ reinterpret_cast(src_u + OFF), kStrideUV, \ reinterpret_cast(src_v + OFF), kStrideUV, \ reinterpret_cast(src_a + OFF), kWidth, \ dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ ATTEN); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B( \ reinterpret_cast(src_y + OFF), kWidth, \ reinterpret_cast(src_u + OFF), kStrideUV, \ reinterpret_cast(src_v + OFF), kStrideUV, \ reinterpret_cast(src_a + OFF), kWidth, \ dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ } \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ } \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_u); \ free_aligned_buffer_page_end(src_v); \ free_aligned_buffer_page_end(src_a); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #if defined(ENABLE_FULL_TESTS) #define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ ALIGN, YALIGN, S_DEPTH) \ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 2, 0, S_DEPTH) \ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) \ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH) #else #define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ ALIGN, YALIGN, 
S_DEPTH) \ TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) #endif #define I010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ l, m) #define I010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ l, m) #define J010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ l, m) #define J010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ l, m) #define F010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ l, m) #define F010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ l, m) #define H010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ l, m) #define H010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ l, m) #define U010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ l, m) #define U010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ l, m) #define V010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ l, m) #define V010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ l, m) #define I210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ 
I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ l, m) #define I210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ l, m) #define J210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ l, m) #define J210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ l, m) #define F210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ l, m) #define F210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ l, m) #define H210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ l, m) #define H210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ l, m) #define U210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ l, m) #define U210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ l, m) #define V210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ l, m) #define V210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ l, m) #define I410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ l, m) #define I410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, 
k, \ l, m) #define J410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ l, m) #define J410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ l, m) #define F410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ l, m) #define F410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ l, m) #define H410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ l, m) #define H410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ l, m) #define U410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ l, m) #define U410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ l, m) #define V410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ l, m) #define V410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ l, m) // These conversions are only optimized for x86 #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(I010Alpha, 2, 2, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(J010Alpha, 2, 2, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(J010Alpha, 2, 2, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(H010Alpha, 2, 2, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(H010Alpha, 2, 2, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(F010Alpha, 2, 2, ARGB, 4, 4, 1, 10) 
TESTQPLANAR16TOB(F010Alpha, 2, 2, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(U010Alpha, 2, 2, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(U010Alpha, 2, 2, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(V010Alpha, 2, 2, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(V010Alpha, 2, 2, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(I210Alpha, 2, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(J210Alpha, 2, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(J210Alpha, 2, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(H210Alpha, 2, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(H210Alpha, 2, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(F210Alpha, 2, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(F210Alpha, 2, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(U210Alpha, 2, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(U210Alpha, 2, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(V210Alpha, 2, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(V210Alpha, 2, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(I410Alpha, 1, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(I410Alpha, 1, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(J410Alpha, 1, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(J410Alpha, 1, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(H410Alpha, 1, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(H410Alpha, 1, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(F410Alpha, 1, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(F410Alpha, 1, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(U410Alpha, 1, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(U410Alpha, 1, 1, ABGR, 4, 4, 1, 10) TESTQPLANAR16TOB(V410Alpha, 1, 1, ARGB, 4, 4, 1, 10) TESTQPLANAR16TOB(V410Alpha, 1, 1, ABGR, 4, 4, 1, 10) #endif // DISABLE_SLOW_TESTS #define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = W1280; \ const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ const int kSizeUV = kStrideUV * 
SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \ const int kBpc = 2; \ align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ reinterpret_cast(src_y + SOFF)[i] = \ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ } \ for (int i = 0; i < kSizeUV; ++i) { \ reinterpret_cast(src_uv + SOFF)[i] = \ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ } \ memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + SOFF), kWidth, \ reinterpret_cast(src_uv + SOFF), \ kStrideUV, dst_argb_c + DOFF, kStrideB, kWidth, \ NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + SOFF), kWidth, \ reinterpret_cast(src_uv + SOFF), \ kStrideUV, dst_argb_opt + DOFF, kStrideB, kWidth, \ NEG kHeight); \ } \ for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ } \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ ALIGN, YALIGN, S_DEPTH) \ TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_ + 1, _Any, +, 0, 0, S_DEPTH) \ TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Unaligned, +, 4, 4, S_DEPTH) \ TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, 
FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) #define P010ToARGB(a, b, c, d, e, f, g, h) \ P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P210ToARGB(a, b, c, d, e, f, g, h) \ P210ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P010ToAR30(a, b, c, d, e, f, g, h) \ P010ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P210ToAR30(a, b, c, d, e, f, g, h) \ P210ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P012ToARGB(a, b, c, d, e, f, g, h) \ P012ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P212ToARGB(a, b, c, d, e, f, g, h) \ P212ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P012ToAR30(a, b, c, d, e, f, g, h) \ P012ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P212ToAR30(a, b, c, d, e, f, g, h) \ P212ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P016ToARGB(a, b, c, d, e, f, g, h) \ P016ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P216ToARGB(a, b, c, d, e, f, g, h) \ P216ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P016ToAR30(a, b, c, d, e, f, g, h) \ P016ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #define P216ToAR30(a, b, c, d, e, f, g, h) \ P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) #ifdef LITTLE_ENDIAN_ONLY_TEST TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) 
TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) #endif // LITTLE_ENDIAN_ONLY_TEST #endif // DISABLE_SLOW_TESTS static int Clamp(int y) { if (y < 0) { y = 0; } if (y > 255) { y = 255; } return y; } static int Clamp10(int y) { if (y < 0) { y = 0; } if (y > 1023) { y = 1023; } return y; } // Test 8 bit YUV to 8 bit RGB TEST_F(LibYUVConvertTest, TestH420ToARGB) { const int kSize = 256; int histogram_b[256]; int histogram_g[256]; int histogram_r[256]; memset(histogram_b, 0, sizeof(histogram_b)); memset(histogram_g, 0, sizeof(histogram_g)); memset(histogram_r, 0, sizeof(histogram_r)); align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2); align_buffer_page_end(argb_pixels, kSize * 4); uint8_t* orig_y = orig_yuv; uint8_t* orig_u = orig_y + kSize; uint8_t* orig_v = orig_u + kSize / 2; // Test grey scale for (int i = 0; i < kSize; ++i) { orig_y[i] = i; } for (int i = 0; i < kSize / 2; ++i) { orig_u[i] = 128; // 128 is 0. orig_v[i] = 128; } H420ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1); for (int i = 0; i < kSize; ++i) { int b = argb_pixels[i * 4 + 0]; int g = argb_pixels[i * 4 + 1]; int r = argb_pixels[i * 4 + 2]; int a = argb_pixels[i * 4 + 3]; ++histogram_b[b]; ++histogram_g[g]; ++histogram_r[r]; // Reference formula for Y channel contribution in YUV to RGB conversions: int expected_y = Clamp(static_cast((i - 16) * 1.164f + 0.5f)); EXPECT_EQ(b, expected_y); EXPECT_EQ(g, expected_y); EXPECT_EQ(r, expected_y); EXPECT_EQ(a, 255); } int count_b = 0; int count_g = 0; int count_r = 0; for (int i = 0; i < kSize; ++i) { if (histogram_b[i]) { ++count_b; } if (histogram_g[i]) { ++count_g; } if (histogram_r[i]) { ++count_r; } } printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r); free_aligned_buffer_page_end(orig_yuv); free_aligned_buffer_page_end(argb_pixels); } // Test 10 bit YUV to 8 bit RGB TEST_F(LibYUVConvertTest, TestH010ToARGB) { const int kSize = 1024; int histogram_b[1024]; int histogram_g[1024]; int histogram_r[1024]; 
memset(histogram_b, 0, sizeof(histogram_b)); memset(histogram_g, 0, sizeof(histogram_g)); memset(histogram_r, 0, sizeof(histogram_r)); align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2); align_buffer_page_end(argb_pixels, kSize * 4); uint16_t* orig_y = reinterpret_cast(orig_yuv); uint16_t* orig_u = orig_y + kSize; uint16_t* orig_v = orig_u + kSize / 2; // Test grey scale for (int i = 0; i < kSize; ++i) { orig_y[i] = i; } for (int i = 0; i < kSize / 2; ++i) { orig_u[i] = 512; // 512 is 0. orig_v[i] = 512; } H010ToARGB(orig_y, 0, orig_u, 0, orig_v, 0, argb_pixels, 0, kSize, 1); for (int i = 0; i < kSize; ++i) { int b = argb_pixels[i * 4 + 0]; int g = argb_pixels[i * 4 + 1]; int r = argb_pixels[i * 4 + 2]; int a = argb_pixels[i * 4 + 3]; ++histogram_b[b]; ++histogram_g[g]; ++histogram_r[r]; int expected_y = Clamp(static_cast((i - 64) * 1.164f / 4)); EXPECT_NEAR(b, expected_y, 1); EXPECT_NEAR(g, expected_y, 1); EXPECT_NEAR(r, expected_y, 1); EXPECT_EQ(a, 255); } int count_b = 0; int count_g = 0; int count_r = 0; for (int i = 0; i < kSize; ++i) { if (histogram_b[i]) { ++count_b; } if (histogram_g[i]) { ++count_g; } if (histogram_r[i]) { ++count_r; } } printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r); free_aligned_buffer_page_end(orig_yuv); free_aligned_buffer_page_end(argb_pixels); } // Test 10 bit YUV to 10 bit RGB // Caveat: Result is near due to float rounding in expected // result. 
// Converts a single row holding every 10 bit Y value (0..1023) with neutral
// chroma to AR30, unpacks each 32 bit pixel and checks the three 10 bit
// channels against the reference Y formula to within 4.
TEST_F(LibYUVConvertTest, TestH010ToAR30) {
  const int kSize = 1024;
  int histogram_b[1024];
  int histogram_g[1024];
  int histogram_r[1024];
  memset(histogram_b, 0, sizeof(histogram_b));
  memset(histogram_g, 0, sizeof(histogram_g));
  memset(histogram_r, 0, sizeof(histogram_r));

  // Sizes are in bytes: kSize 16 bit Y samples followed by kSize / 2 U and
  // kSize / 2 V samples of 16 bits each; the output is 4 bytes per pixel.
  align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2);
  align_buffer_page_end(ar30_pixels, kSize * 4);
  uint16_t* orig_y = reinterpret_cast<uint16_t*>(orig_yuv);
  uint16_t* orig_u = orig_y + kSize;
  uint16_t* orig_v = orig_u + kSize / 2;

  // Test grey scale
  for (int i = 0; i < kSize; ++i) {
    orig_y[i] = i;
  }
  for (int i = 0; i < kSize / 2; ++i) {
    orig_u[i] = 512;  // 512 is 0.
    orig_v[i] = 512;
  }

  // A single row is converted (height 1); all strides are passed as 0.
  H010ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1);

  for (int i = 0; i < kSize; ++i) {
    // Each pixel is one 32 bit word: B in bits 0-9, G in bits 10-19, R in
    // bits 20-29 and a 2 bit alpha in bits 30-31.
    const uint32_t ar30 = reinterpret_cast<uint32_t*>(ar30_pixels)[i];
    int b10 = ar30 & 1023;
    int g10 = (ar30 >> 10) & 1023;
    int r10 = (ar30 >> 20) & 1023;
    int a2 = (ar30 >> 30) & 3;
    ++histogram_b[b10];
    ++histogram_g[g10];
    ++histogram_r[r10];
    int expected_y = Clamp10(static_cast<int>((i - 64) * 1.164f + 0.5));
    EXPECT_NEAR(b10, expected_y, 4);
    EXPECT_NEAR(g10, expected_y, 4);
    EXPECT_NEAR(r10, expected_y, 4);
    EXPECT_EQ(a2, 3);
  }

  // Count how many distinct output values each channel produced.
  int count_b = 0;
  int count_g = 0;
  int count_r = 0;
  for (int i = 0; i < kSize; ++i) {
    if (histogram_b[i]) {
      ++count_b;
    }
    if (histogram_g[i]) {
      ++count_g;
    }
    if (histogram_r[i]) {
      ++count_r;
    }
  }
  printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r);

  free_aligned_buffer_page_end(orig_yuv);
  free_aligned_buffer_page_end(ar30_pixels);
}

// Test 10 bit YUV to 10 bit RGB
// Caveat: Result is near due to float rounding in expected
// result.
TEST_F(LibYUVConvertTest, TestH010ToAB30) { const int kSize = 1024; int histogram_b[1024]; int histogram_g[1024]; int histogram_r[1024]; memset(histogram_b, 0, sizeof(histogram_b)); memset(histogram_g, 0, sizeof(histogram_g)); memset(histogram_r, 0, sizeof(histogram_r)); align_buffer_page_end(orig_yuv, kSize * 2 + kSize / 2 * 2 * 2); align_buffer_page_end(ab30_pixels, kSize * 4); uint16_t* orig_y = reinterpret_cast(orig_yuv); uint16_t* orig_u = orig_y + kSize; uint16_t* orig_v = orig_u + kSize / 2; // Test grey scale for (int i = 0; i < kSize; ++i) { orig_y[i] = i; } for (int i = 0; i < kSize / 2; ++i) { orig_u[i] = 512; // 512 is 0. orig_v[i] = 512; } H010ToAB30(orig_y, 0, orig_u, 0, orig_v, 0, ab30_pixels, 0, kSize, 1); for (int i = 0; i < kSize; ++i) { int r10 = reinterpret_cast(ab30_pixels)[i] & 1023; int g10 = (reinterpret_cast(ab30_pixels)[i] >> 10) & 1023; int b10 = (reinterpret_cast(ab30_pixels)[i] >> 20) & 1023; int a2 = (reinterpret_cast(ab30_pixels)[i] >> 30) & 3; ++histogram_b[b10]; ++histogram_g[g10]; ++histogram_r[r10]; int expected_y = Clamp10(static_cast((i - 64) * 1.164f)); EXPECT_NEAR(b10, expected_y, 4); EXPECT_NEAR(g10, expected_y, 4); EXPECT_NEAR(r10, expected_y, 4); EXPECT_EQ(a2, 3); } int count_b = 0; int count_g = 0; int count_r = 0; for (int i = 0; i < kSize; ++i) { if (histogram_b[i]) { ++count_b; } if (histogram_g[i]) { ++count_g; } if (histogram_r[i]) { ++count_r; } } printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r); free_aligned_buffer_page_end(orig_yuv); free_aligned_buffer_page_end(ab30_pixels); } // Test 8 bit YUV to 10 bit RGB TEST_F(LibYUVConvertTest, TestH420ToAR30) { const int kSize = 256; const int kHistSize = 1024; int histogram_b[kHistSize]; int histogram_g[kHistSize]; int histogram_r[kHistSize]; memset(histogram_b, 0, sizeof(histogram_b)); memset(histogram_g, 0, sizeof(histogram_g)); memset(histogram_r, 0, sizeof(histogram_r)); align_buffer_page_end(orig_yuv, kSize + kSize / 2 * 2); 
align_buffer_page_end(ar30_pixels, kSize * 4); uint8_t* orig_y = orig_yuv; uint8_t* orig_u = orig_y + kSize; uint8_t* orig_v = orig_u + kSize / 2; // Test grey scale for (int i = 0; i < kSize; ++i) { orig_y[i] = i; } for (int i = 0; i < kSize / 2; ++i) { orig_u[i] = 128; // 128 is 0. orig_v[i] = 128; } H420ToAR30(orig_y, 0, orig_u, 0, orig_v, 0, ar30_pixels, 0, kSize, 1); for (int i = 0; i < kSize; ++i) { int b10 = reinterpret_cast(ar30_pixels)[i] & 1023; int g10 = (reinterpret_cast(ar30_pixels)[i] >> 10) & 1023; int r10 = (reinterpret_cast(ar30_pixels)[i] >> 20) & 1023; int a2 = (reinterpret_cast(ar30_pixels)[i] >> 30) & 3; ++histogram_b[b10]; ++histogram_g[g10]; ++histogram_r[r10]; int expected_y = Clamp10(static_cast((i - 16) * 1.164f * 4.f)); EXPECT_NEAR(b10, expected_y, 4); EXPECT_NEAR(g10, expected_y, 4); EXPECT_NEAR(r10, expected_y, 4); EXPECT_EQ(a2, 3); } int count_b = 0; int count_g = 0; int count_r = 0; for (int i = 0; i < kHistSize; ++i) { if (histogram_b[i]) { ++count_b; } if (histogram_g[i]) { ++count_g; } if (histogram_r[i]) { ++count_r; } } printf("uniques: B %d, G, %d, R %d\n", count_b, count_g, count_r); free_aligned_buffer_page_end(orig_yuv); free_aligned_buffer_page_end(ar30_pixels); } // Test I400 with jpeg matrix is same as J400 TEST_F(LibYUVConvertTest, TestI400) { const int kSize = 256; align_buffer_page_end(orig_i400, kSize); align_buffer_page_end(argb_pixels_i400, kSize * 4); align_buffer_page_end(argb_pixels_j400, kSize * 4); align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4); align_buffer_page_end(argb_pixels_h709_i400, kSize * 4); align_buffer_page_end(argb_pixels_2020_i400, kSize * 4); // Test grey scale for (int i = 0; i < kSize; ++i) { orig_i400[i] = i; } J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1); I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1); I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants, kSize, 1); I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants, 
kSize, 1); I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants, kSize, 1); EXPECT_EQ(0, argb_pixels_i400[0]); EXPECT_EQ(0, argb_pixels_j400[0]); EXPECT_EQ(0, argb_pixels_jpeg_i400[0]); EXPECT_EQ(0, argb_pixels_h709_i400[0]); EXPECT_EQ(0, argb_pixels_2020_i400[0]); EXPECT_EQ(0, argb_pixels_i400[16 * 4]); EXPECT_EQ(16, argb_pixels_j400[16 * 4]); EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]); EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]); EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]); EXPECT_EQ(130, argb_pixels_i400[128 * 4]); EXPECT_EQ(128, argb_pixels_j400[128 * 4]); EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]); EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]); EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]); EXPECT_EQ(255, argb_pixels_i400[255 * 4]); EXPECT_EQ(255, argb_pixels_j400[255 * 4]); EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]); EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]); EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]); for (int i = 0; i < kSize * 4; ++i) { if ((i & 3) == 3) { EXPECT_EQ(255, argb_pixels_j400[i]); } else { EXPECT_EQ(i / 4, argb_pixels_j400[i]); } EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]); } free_aligned_buffer_page_end(orig_i400); free_aligned_buffer_page_end(argb_pixels_i400); free_aligned_buffer_page_end(argb_pixels_j400); free_aligned_buffer_page_end(argb_pixels_jpeg_i400); free_aligned_buffer_page_end(argb_pixels_h709_i400); free_aligned_buffer_page_end(argb_pixels_2020_i400); } // Test RGB24 to ARGB and back to RGB24 TEST_F(LibYUVConvertTest, TestARGBToRGB24) { const int kSize = 256; align_buffer_page_end(orig_rgb24, kSize * 3); align_buffer_page_end(argb_pixels, kSize * 4); align_buffer_page_end(dest_rgb24, kSize * 3); // Test grey scale for (int i = 0; i < kSize * 3; ++i) { orig_rgb24[i] = i; } RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1); ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1); for (int i = 0; i < kSize * 3; ++i) { EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]); } 
free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(argb_pixels); free_aligned_buffer_page_end(dest_rgb24); } TEST_F(LibYUVConvertTest, Test565) { SIMD_ALIGNED(uint8_t orig_pixels[256][4]); SIMD_ALIGNED(uint8_t pixels565[256][2]); for (int i = 0; i < 256; ++i) { for (int j = 0; j < 4; ++j) { orig_pixels[i][j] = i; } } ARGBToRGB565(&orig_pixels[0][0], 0, &pixels565[0][0], 0, 256, 1); uint32_t checksum = HashDjb2(&pixels565[0][0], sizeof(pixels565), 5381); EXPECT_EQ(610919429u, checksum); } // Test RGB24 to J420 is exact #if defined(LIBYUV_BIT_EXACT) TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { const int kSize = 256; align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 align_buffer_page_end(dest_j420, kSize * 3 / 2 * 2); int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / (kSize * 2) * benchmark_iterations_; for (int i = 0; i < kSize * 3 * 2; ++i) { orig_rgb24[i] = i; } for (int i = 0; i < iterations256; ++i) { RGB24ToJ420(orig_rgb24, kSize * 3, dest_j420, kSize, // Y plane dest_j420 + kSize * 2, kSize / 2, // U plane dest_j420 + kSize * 5 / 2, kSize / 2, // V plane kSize, 2); } uint32_t checksum = HashDjb2(dest_j420, kSize * 3 / 2 * 2, 5381); EXPECT_EQ(2755440272u, checksum); free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_j420); } #endif // Test RGB24 to I420 is exact #if defined(LIBYUV_BIT_EXACT) TEST_F(LibYUVConvertTest, TestRGB24ToI420) { const int kSize = 256; align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 align_buffer_page_end(dest_i420, kSize * 3 / 2 * 2); int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) / (kSize * 2) * benchmark_iterations_; for (int i = 0; i < kSize * 3 * 2; ++i) { orig_rgb24[i] = i; } for (int i = 0; i < iterations256; ++i) { RGB24ToI420(orig_rgb24, kSize * 3, dest_i420, kSize, // Y plane dest_i420 + kSize * 2, kSize / 2, // U plane dest_i420 + kSize * 5 / 2, kSize / 2, // V plane kSize, 2); } uint32_t 
checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381); EXPECT_EQ(1526656597u, checksum); free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_i420); } #endif } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/cpu_test.cc000066400000000000000000000212421416500237200217020ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/version.h" namespace libyuv { TEST_F(LibYUVBaseTest, TestCpuHas) { int cpu_flags = TestCpuFlag(-1); printf("Cpu Flags %d\n", cpu_flags); #if defined(__arm__) || defined(__aarch64__) int has_arm = TestCpuFlag(kCpuHasARM); printf("Has ARM %d\n", has_arm); int has_neon = TestCpuFlag(kCpuHasNEON); printf("Has NEON %d\n", has_neon); #endif int has_x86 = TestCpuFlag(kCpuHasX86); int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); int has_sse41 = TestCpuFlag(kCpuHasSSE41); int has_sse42 = TestCpuFlag(kCpuHasSSE42); int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); int has_fma3 = TestCpuFlag(kCpuHasFMA3); int has_f16c = TestCpuFlag(kCpuHasF16C); int has_gfni = TestCpuFlag(kCpuHasGFNI); int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI); int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); printf("Has X86 
%d\n", has_x86); printf("Has SSE2 %d\n", has_sse2); printf("Has SSSE3 %d\n", has_ssse3); printf("Has SSE41 %d\n", has_sse41); printf("Has SSE42 %d\n", has_sse42); printf("Has AVX %d\n", has_avx); printf("Has AVX2 %d\n", has_avx2); printf("Has ERMS %d\n", has_erms); printf("Has FMA3 %d\n", has_fma3); printf("Has F16C %d\n", has_f16c); printf("Has GFNI %d\n", has_gfni); printf("Has AVX512BW %d\n", has_avx512bw); printf("Has AVX512VL %d\n", has_avx512vl); printf("Has AVX512VBMI %d\n", has_avx512vbmi); printf("Has AVX512VBMI2 %d\n", has_avx512vbmi2); printf("Has AVX512VBITALG %d\n", has_avx512vbitalg); printf("Has AVX512VPOPCNTDQ %d\n", has_avx512vpopcntdq); #if defined(__mips__) int has_mips = TestCpuFlag(kCpuHasMIPS); printf("Has MIPS %d\n", has_mips); int has_msa = TestCpuFlag(kCpuHasMSA); printf("Has MSA %d\n", has_msa); int has_mmi = TestCpuFlag(kCpuHasMMI); printf("Has MMI %d\n", has_mmi); #endif } TEST_F(LibYUVBaseTest, TestCompilerMacros) { // Tests all macros used in public headers. 
#ifdef __ATOMIC_RELAXED printf("__ATOMIC_RELAXED %d\n", __ATOMIC_RELAXED); #endif #ifdef __cplusplus printf("__cplusplus %ld\n", __cplusplus); #endif #ifdef __clang_major__ printf("__clang_major__ %d\n", __clang_major__); #endif #ifdef __clang_minor__ printf("__clang_minor__ %d\n", __clang_minor__); #endif #ifdef __GNUC__ printf("__GNUC__ %d\n", __GNUC__); #endif #ifdef __GNUC_MINOR__ printf("__GNUC_MINOR__ %d\n", __GNUC_MINOR__); #endif #ifdef __i386__ printf("__i386__ %d\n", __i386__); #endif #ifdef __mips printf("__mips %d\n", __mips); #endif #ifdef __mips_isa_rev printf("__mips_isa_rev %d\n", __mips_isa_rev); #endif #ifdef __x86_64__ printf("__x86_64__ %d\n", __x86_64__); #endif #ifdef _MSC_VER printf("_MSC_VER %d\n", _MSC_VER); #endif #ifdef __aarch64__ printf("__aarch64__ %d\n", __aarch64__); #endif #ifdef __APPLE__ printf("__APPLE__ %d\n", __APPLE__); #endif #ifdef __arm__ printf("__arm__ %d\n", __arm__); #endif #ifdef __clang__ printf("__clang__ %d\n", __clang__); #endif #ifdef __CLR_VER printf("__CLR_VER %d\n", __CLR_VER); #endif #ifdef __CYGWIN__ printf("__CYGWIN__ %d\n", __CYGWIN__); #endif #ifdef __llvm__ printf("__llvm__ %d\n", __llvm__); #endif #ifdef __mips_msa printf("__mips_msa %d\n", __mips_msa); #endif #ifdef __native_client__ printf("__native_client__ %d\n", __native_client__); #endif #ifdef __pic__ printf("__pic__ %d\n", __pic__); #endif #ifdef __pnacl__ printf("__pnacl__ %d\n", __pnacl__); #endif #ifdef _M_IX86 printf("_M_IX86 %d\n", _M_IX86); #endif #ifdef _M_X64 printf("_M_X64 %d\n", _M_X64); #endif #ifdef _MIPS_ARCH_LOONGSON3A printf("_MIPS_ARCH_LOONGSON3A %d\n", _MIPS_ARCH_LOONGSON3A); #endif #ifdef _WIN32 printf("_WIN32 %d\n", _WIN32); #endif #ifdef GG_LONGLONG printf("GG_LONGLONG %d\n", GG_LONGLONG); #endif #ifdef INT_TYPES_DEFINED printf("INT_TYPES_DEFINED\n"); #endif #ifdef __has_feature printf("__has_feature\n"); #if __has_feature(memory_sanitizer) printf("__has_feature(memory_sanitizer) %d\n", __has_feature(memory_sanitizer)); #endif 
#endif } #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \ defined(_M_X64) TEST_F(LibYUVBaseTest, TestCpuId) { int has_x86 = TestCpuFlag(kCpuHasX86); if (has_x86) { int cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor // CyrixInstead Cyrix processor // GenuineIntel Intel processor // GenuineTMx86 Transmeta processor // Geode by NSC National Semiconductor processor // NexGenDriven NexGen processor // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor CpuId(0, 0, cpu_info); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; printf("Cpu Vendor: %s %x %x %x\n", reinterpret_cast(&cpu_info[0]), cpu_info[0], cpu_info[1], cpu_info[2]); EXPECT_EQ(12u, strlen(reinterpret_cast(&cpu_info[0]))); // CPU Family and Model // 3:0 - Stepping // 7:4 - Model // 11:8 - Family // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family CpuId(1, 0, cpu_info); int family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); int model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, model); } } #endif static int FileExists(const char* file_name) { FILE* f = fopen(file_name, "r"); if (!f) { return 0; } fclose(f); return 1; } TEST_F(LibYUVBaseTest, TestLinuxNeon) { if (FileExists("../../unit_test/testdata/arm_v7.txt")) { printf("Note: testing to load \"../../unit_test/testdata/arm_v7.txt\"\n"); EXPECT_EQ(0, ArmCpuCaps("../../unit_test/testdata/arm_v7.txt")); EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/tegra3.txt")); EXPECT_EQ(kCpuHasNEON, ArmCpuCaps("../../unit_test/testdata/juno.txt")); } else { printf("WARNING: unable to load \"../../unit_test/testdata/arm_v7.txt\"\n"); } #if defined(__linux__) && defined(__ARM_NEON__) if (FileExists("/proc/cpuinfo")) { if (kCpuHasNEON != ArmCpuCaps("/proc/cpuinfo")) { // This can 
happen on ARM emulator but /proc/cpuinfo is from host. printf("WARNING: Neon build enabled but CPU does not have NEON\n"); } } else { printf("WARNING: unable to load \"/proc/cpuinfo\"\n"); } #endif } TEST_F(LibYUVBaseTest, TestLinuxMipsMsaMmi) { if (FileExists("../../unit_test/testdata/mips.txt")) { printf("Note: testing to load \"../../unit_test/testdata/mips.txt\"\n"); EXPECT_EQ(0, MipsCpuCaps("../../unit_test/testdata/mips.txt")); EXPECT_EQ(kCpuHasMMI, MipsCpuCaps("../../unit_test/testdata/mips_loongson3.txt")); EXPECT_EQ(kCpuHasMMI, MipsCpuCaps("../../unit_test/testdata/mips_loongson_mmi.txt")); EXPECT_EQ(kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_msa.txt")); EXPECT_EQ(kCpuHasMMI | kCpuHasMSA, MipsCpuCaps("../../unit_test/testdata/mips_loongson2k.txt")); } else { printf("WARNING: unable to load \"../../unit_test/testdata/mips.txt\"\n"); } } // TODO(fbarchard): Fix clangcl test of cpuflags. #ifdef _MSC_VER TEST_F(LibYUVBaseTest, DISABLED_TestSetCpuFlags) { #else TEST_F(LibYUVBaseTest, TestSetCpuFlags) { #endif // Reset any masked flags that may have been set so auto init is enabled. MaskCpuFlags(0); int original_cpu_flags = TestCpuFlag(-1); // Test setting different CPU configurations. int cpu_flags = kCpuHasARM | kCpuHasNEON | kCpuInitialized; SetCpuFlags(cpu_flags); EXPECT_EQ(cpu_flags, TestCpuFlag(-1)); cpu_flags = kCpuHasX86 | kCpuInitialized; SetCpuFlags(cpu_flags); EXPECT_EQ(cpu_flags, TestCpuFlag(-1)); // Test that setting 0 turns auto-init back on. SetCpuFlags(0); EXPECT_EQ(original_cpu_flags, TestCpuFlag(-1)); // Restore the CPU flag mask. MaskCpuFlags(benchmark_cpu_info_); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/cpu_thread_test.cc000066400000000000000000000032121416500237200232260ustar00rootroot00000000000000/* * Copyright 2017 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. 
An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include "libyuv/cpu_id.h" #if defined(__clang__) && !defined(__wasm__) #if __has_include() #define LIBYUV_HAVE_PTHREAD 1 #endif #elif defined(__linux__) #define LIBYUV_HAVE_PTHREAD 1 #endif #ifdef LIBYUV_HAVE_PTHREAD #include #endif namespace libyuv { #ifdef LIBYUV_HAVE_PTHREAD void* ThreadMain(void* arg) { int* flags = static_cast(arg); *flags = TestCpuFlag(kCpuInitialized); return nullptr; } #endif // LIBYUV_HAVE_PTHREAD // Call TestCpuFlag() from two threads. ThreadSanitizer should not report any // data race. TEST(LibYUVCpuThreadTest, TestCpuFlagMultipleThreads) { #ifdef LIBYUV_HAVE_PTHREAD int cpu_flags1; int cpu_flags2; int ret; pthread_t thread1; pthread_t thread2; MaskCpuFlags(0); // Reset to 0 to allow auto detect. ret = pthread_create(&thread1, nullptr, ThreadMain, &cpu_flags1); ASSERT_EQ(ret, 0); ret = pthread_create(&thread2, nullptr, ThreadMain, &cpu_flags2); ASSERT_EQ(ret, 0); ret = pthread_join(thread1, nullptr); EXPECT_EQ(ret, 0); ret = pthread_join(thread2, nullptr); EXPECT_EQ(ret, 0); EXPECT_EQ(cpu_flags1, cpu_flags2); #else printf("pthread unavailable; Test skipped."); #endif // LIBYUV_HAVE_PTHREAD } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/math_test.cc000066400000000000000000000121361416500237200220460ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include #include #include #include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" #ifdef ENABLE_ROW_TESTS #include "libyuv/scale_row.h" #endif namespace libyuv { #ifdef ENABLE_ROW_TESTS TEST_F(LibYUVBaseTest, TestFixedDiv) { int num[1280]; int div[1280]; int result_opt[1280]; int result_c[1280]; EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1)); EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1)); // TODO(fbarchard): Avoid the following that throw exceptions. // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1)); // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1)); EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640)); EXPECT_EQ(0x30000, libyuv::FixedDiv(640 * 3, 640)); EXPECT_EQ(0x40000, libyuv::FixedDiv(640 * 4, 640)); EXPECT_EQ(0x50000, libyuv::FixedDiv(640 * 5, 640)); EXPECT_EQ(0x60000, libyuv::FixedDiv(640 * 6, 640)); EXPECT_EQ(0x70000, libyuv::FixedDiv(640 * 7, 640)); EXPECT_EQ(0x80000, libyuv::FixedDiv(640 * 8, 640)); EXPECT_EQ(0xa0000, libyuv::FixedDiv(640 * 10, 640)); EXPECT_EQ(0x20000, libyuv::FixedDiv(960 * 2, 960)); EXPECT_EQ(0x08000, libyuv::FixedDiv(640 / 2, 640)); EXPECT_EQ(0x04000, libyuv::FixedDiv(640 / 4, 640)); EXPECT_EQ(0x20000, libyuv::FixedDiv(1080 * 2, 1080)); EXPECT_EQ(0x20000, libyuv::FixedDiv(200000, 100000)); EXPECT_EQ(0x18000, libyuv::FixedDiv(150000, 100000)); EXPECT_EQ(0x20000, libyuv::FixedDiv(40000, 20000)); EXPECT_EQ(0x20000, libyuv::FixedDiv(-40000, -20000)); EXPECT_EQ(-0x20000, libyuv::FixedDiv(40000, -20000)); EXPECT_EQ(-0x20000, libyuv::FixedDiv(-40000, 20000)); EXPECT_EQ(0x10000, libyuv::FixedDiv(4095, 4095)); EXPECT_EQ(0x10000, libyuv::FixedDiv(4096, 4096)); EXPECT_EQ(0x10000, libyuv::FixedDiv(4097, 4097)); EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1)); for (int i = 1; i < 4100; ++i) { EXPECT_EQ(0x10000, libyuv::FixedDiv(i, i)); EXPECT_EQ(0x20000, libyuv::FixedDiv(i * 2, i)); EXPECT_EQ(0x30000, libyuv::FixedDiv(i * 3, i)); EXPECT_EQ(0x40000, libyuv::FixedDiv(i * 
4, i)); EXPECT_EQ(0x08000, libyuv::FixedDiv(i, i * 2)); EXPECT_NEAR(16384 * 65536 / i, libyuv::FixedDiv(16384, i), 1); } EXPECT_EQ(123 * 65536, libyuv::FixedDiv(123, 1)); MemRandomize(reinterpret_cast(&num[0]), sizeof(num)); MemRandomize(reinterpret_cast(&div[0]), sizeof(div)); for (int j = 0; j < 1280; ++j) { if (div[j] == 0) { div[j] = 1280; } num[j] &= 0xffff; // Clamp to avoid divide overflow. } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { for (int j = 0; j < 1280; ++j) { result_opt[j] = libyuv::FixedDiv(num[j], div[j]); } } for (int j = 0; j < 1280; ++j) { result_c[j] = libyuv::FixedDiv_C(num[j], div[j]); EXPECT_NEAR(result_c[j], result_opt[j], 1); } } TEST_F(LibYUVBaseTest, TestFixedDiv_Opt) { int num[1280]; int div[1280]; int result_opt[1280]; int result_c[1280]; MemRandomize(reinterpret_cast(&num[0]), sizeof(num)); MemRandomize(reinterpret_cast(&div[0]), sizeof(div)); for (int j = 0; j < 1280; ++j) { num[j] &= 4095; // Make numerator smaller. div[j] &= 4095; // Make divisor smaller. if (div[j] == 0) { div[j] = 1280; } } int has_x86 = TestCpuFlag(kCpuHasX86); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { if (has_x86) { for (int j = 0; j < 1280; ++j) { result_opt[j] = libyuv::FixedDiv(num[j], div[j]); } } else { for (int j = 0; j < 1280; ++j) { result_opt[j] = libyuv::FixedDiv_C(num[j], div[j]); } } } for (int j = 0; j < 1280; ++j) { result_c[j] = libyuv::FixedDiv_C(num[j], div[j]); EXPECT_NEAR(result_c[j], result_opt[j], 1); } } TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) { int num[1280]; int div[1280]; int result_opt[1280]; int result_c[1280]; MemRandomize(reinterpret_cast(&num[0]), sizeof(num)); MemRandomize(reinterpret_cast(&div[0]), sizeof(div)); for (int j = 0; j < 1280; ++j) { num[j] &= 4095; // Make numerator smaller. div[j] &= 4095; // Make divisor smaller. 
if (div[j] <= 1) { div[j] = 1280; } } int has_x86 = TestCpuFlag(kCpuHasX86); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { if (has_x86) { for (int j = 0; j < 1280; ++j) { result_opt[j] = libyuv::FixedDiv1(num[j], div[j]); } } else { for (int j = 0; j < 1280; ++j) { result_opt[j] = libyuv::FixedDiv1_C(num[j], div[j]); } } } for (int j = 0; j < 1280; ++j) { result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]); EXPECT_NEAR(result_c[j], result_opt[j], 1); } } #endif // ENABLE_ROW_TESTS } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/planar_test.cc000066400000000000000000004550711416500237200224030ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include #include "../unit_test/unit_test.h" #include "libyuv/compare.h" #include "libyuv/convert.h" #include "libyuv/convert_argb.h" #include "libyuv/convert_from.h" #include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" #include "libyuv/scale.h" #ifdef ENABLE_ROW_TESTS // row.h defines SIMD_ALIGNED, overriding unit_test.h // TODO(fbarchard): Remove row.h from unittests. Test public functions. 
#include "libyuv/row.h" /* For ScaleSumSamples_Neon */ #endif #if defined(LIBYUV_BIT_EXACT) #define EXPECTED_ATTENUATE_DIFF 0 #else #define EXPECTED_ATTENUATE_DIFF 2 #endif namespace libyuv { TEST_F(LibYUVPlanarTest, TestAttenuate) { const int kSize = 1280 * 4; align_buffer_page_end(orig_pixels, kSize); align_buffer_page_end(atten_pixels, kSize); align_buffer_page_end(unatten_pixels, kSize); align_buffer_page_end(atten2_pixels, kSize); // Test unattenuation clamps orig_pixels[0 * 4 + 0] = 200u; orig_pixels[0 * 4 + 1] = 129u; orig_pixels[0 * 4 + 2] = 127u; orig_pixels[0 * 4 + 3] = 128u; // Test unattenuation transparent and opaque are unaffected orig_pixels[1 * 4 + 0] = 16u; orig_pixels[1 * 4 + 1] = 64u; orig_pixels[1 * 4 + 2] = 192u; orig_pixels[1 * 4 + 3] = 0u; orig_pixels[2 * 4 + 0] = 16u; orig_pixels[2 * 4 + 1] = 64u; orig_pixels[2 * 4 + 2] = 192u; orig_pixels[2 * 4 + 3] = 255u; orig_pixels[3 * 4 + 0] = 16u; orig_pixels[3 * 4 + 1] = 64u; orig_pixels[3 * 4 + 2] = 192u; orig_pixels[3 * 4 + 3] = 128u; ARGBUnattenuate(orig_pixels, 0, unatten_pixels, 0, 4, 1); EXPECT_EQ(255u, unatten_pixels[0 * 4 + 0]); EXPECT_EQ(255u, unatten_pixels[0 * 4 + 1]); EXPECT_EQ(254u, unatten_pixels[0 * 4 + 2]); EXPECT_EQ(128u, unatten_pixels[0 * 4 + 3]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 0]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 1]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 2]); EXPECT_EQ(0u, unatten_pixels[1 * 4 + 3]); EXPECT_EQ(16u, unatten_pixels[2 * 4 + 0]); EXPECT_EQ(64u, unatten_pixels[2 * 4 + 1]); EXPECT_EQ(192u, unatten_pixels[2 * 4 + 2]); EXPECT_EQ(255u, unatten_pixels[2 * 4 + 3]); EXPECT_EQ(32u, unatten_pixels[3 * 4 + 0]); EXPECT_EQ(128u, unatten_pixels[3 * 4 + 1]); EXPECT_EQ(255u, unatten_pixels[3 * 4 + 2]); EXPECT_EQ(128u, unatten_pixels[3 * 4 + 3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i * 4 + 0] = i; orig_pixels[i * 4 + 1] = i / 2; orig_pixels[i * 4 + 2] = i / 3; orig_pixels[i * 4 + 3] = i; } ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 1280, 1); 
ARGBUnattenuate(atten_pixels, 0, unatten_pixels, 0, 1280, 1); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBAttenuate(unatten_pixels, 0, atten2_pixels, 0, 1280, 1); } for (int i = 0; i < 1280; ++i) { EXPECT_NEAR(atten_pixels[i * 4 + 0], atten2_pixels[i * 4 + 0], 2); EXPECT_NEAR(atten_pixels[i * 4 + 1], atten2_pixels[i * 4 + 1], 2); EXPECT_NEAR(atten_pixels[i * 4 + 2], atten2_pixels[i * 4 + 2], 2); EXPECT_NEAR(atten_pixels[i * 4 + 3], atten2_pixels[i * 4 + 3], 2); } // Make sure transparent, 50% and opaque are fully accurate. EXPECT_EQ(0, atten_pixels[0 * 4 + 0]); EXPECT_EQ(0, atten_pixels[0 * 4 + 1]); EXPECT_EQ(0, atten_pixels[0 * 4 + 2]); EXPECT_EQ(0, atten_pixels[0 * 4 + 3]); EXPECT_EQ(64, atten_pixels[128 * 4 + 0]); EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); EXPECT_NEAR(254, atten_pixels[255 * 4 + 0], EXPECTED_ATTENUATE_DIFF); EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], EXPECTED_ATTENUATE_DIFF); EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], EXPECTED_ATTENUATE_DIFF); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels); free_aligned_buffer_page_end(unatten_pixels); free_aligned_buffer_page_end(atten_pixels); free_aligned_buffer_page_end(orig_pixels); } static int TestAttenuateI(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb[i + off] = (fastrand() & 0xff); } memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBAttenuate(src_argb + off, kStride, dst_argb_c, kStride, width, invert * height); 
MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBAttenuate(src_argb + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Any) { int max_diff = TestAttenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Unaligned) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Invert) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBAttenuate_Opt) { int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } static int TestUnattenuateI(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb[i + off] = (fastrand() & 0xff); } ARGBAttenuate(src_argb + off, kStride, src_argb + off, 
kStride, width, height); memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBUnattenuate(src_argb + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBUnattenuate(src_argb + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, EXPECTED_ATTENUATE_DIFF); } TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) { SIMD_ALIGNED(uint8_t orig_pixels[16][16][4]); SIMD_ALIGNED(int32_t added_pixels[16][16][4]); for (int y = 0; y < 16; ++y) { for (int x = 0; x < 16; ++x) { orig_pixels[y][x][0] = 1u; orig_pixels[y][x][1] = 2u; orig_pixels[y][x][2] = 3u; 
orig_pixels[y][x][3] = 255u; } } ARGBComputeCumulativeSum(&orig_pixels[0][0][0], 16 * 4, &added_pixels[0][0][0], 16 * 4, 16, 16); for (int y = 0; y < 16; ++y) { for (int x = 0; x < 16; ++x) { EXPECT_EQ((x + 1) * (y + 1), added_pixels[y][x][0]); EXPECT_EQ((x + 1) * (y + 1) * 2, added_pixels[y][x][1]); EXPECT_EQ((x + 1) * (y + 1) * 3, added_pixels[y][x][2]); EXPECT_EQ((x + 1) * (y + 1) * 255, added_pixels[y][x][3]); } } } // near is for legacy platforms. TEST_F(LibYUVPlanarTest, TestARGBGray) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test black orig_pixels[3][0] = 0u; orig_pixels[3][1] = 0u; orig_pixels[3][2] = 0u; orig_pixels[3][3] = 255u; // Test white orig_pixels[4][0] = 255u; orig_pixels[4][1] = 255u; orig_pixels[4][2] = 255u; orig_pixels[4][3] = 255u; // Test color orig_pixels[5][0] = 16u; orig_pixels[5][1] = 64u; orig_pixels[5][2] = 192u; orig_pixels[5][3] = 224u; // Do 16 to test asm version. 
ARGBGray(&orig_pixels[0][0], 0, 0, 0, 16, 1); EXPECT_NEAR(29u, orig_pixels[0][0], 1); EXPECT_NEAR(29u, orig_pixels[0][1], 1); EXPECT_NEAR(29u, orig_pixels[0][2], 1); EXPECT_EQ(128u, orig_pixels[0][3]); EXPECT_EQ(149u, orig_pixels[1][0]); EXPECT_EQ(149u, orig_pixels[1][1]); EXPECT_EQ(149u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); EXPECT_NEAR(77u, orig_pixels[2][0], 1); EXPECT_NEAR(77u, orig_pixels[2][1], 1); EXPECT_NEAR(77u, orig_pixels[2][2], 1); EXPECT_EQ(255u, orig_pixels[2][3]); EXPECT_EQ(0u, orig_pixels[3][0]); EXPECT_EQ(0u, orig_pixels[3][1]); EXPECT_EQ(0u, orig_pixels[3][2]); EXPECT_EQ(255u, orig_pixels[3][3]); EXPECT_EQ(255u, orig_pixels[4][0]); EXPECT_EQ(255u, orig_pixels[4][1]); EXPECT_EQ(255u, orig_pixels[4][2]); EXPECT_EQ(255u, orig_pixels[4][3]); EXPECT_NEAR(97u, orig_pixels[5][0], 1); EXPECT_NEAR(97u, orig_pixels[5][1], 1); EXPECT_NEAR(97u, orig_pixels[5][2], 1); EXPECT_EQ(224u, orig_pixels[5][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBGray(&orig_pixels[0][0], 0, 0, 0, 1280, 1); } } TEST_F(LibYUVPlanarTest, TestARGBGrayTo) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t gray_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test black orig_pixels[3][0] = 0u; orig_pixels[3][1] = 0u; orig_pixels[3][2] = 0u; orig_pixels[3][3] = 255u; // Test white orig_pixels[4][0] = 255u; orig_pixels[4][1] = 255u; orig_pixels[4][2] = 255u; orig_pixels[4][3] = 255u; // Test color orig_pixels[5][0] = 16u; orig_pixels[5][1] = 64u; 
orig_pixels[5][2] = 192u; orig_pixels[5][3] = 224u; // Do 16 to test asm version. ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 16, 1); EXPECT_NEAR(30u, gray_pixels[0][0], 1); EXPECT_NEAR(30u, gray_pixels[0][1], 1); EXPECT_NEAR(30u, gray_pixels[0][2], 1); EXPECT_NEAR(128u, gray_pixels[0][3], 1); EXPECT_NEAR(149u, gray_pixels[1][0], 1); EXPECT_NEAR(149u, gray_pixels[1][1], 1); EXPECT_NEAR(149u, gray_pixels[1][2], 1); EXPECT_NEAR(0u, gray_pixels[1][3], 1); EXPECT_NEAR(76u, gray_pixels[2][0], 1); EXPECT_NEAR(76u, gray_pixels[2][1], 1); EXPECT_NEAR(76u, gray_pixels[2][2], 1); EXPECT_NEAR(255u, gray_pixels[2][3], 1); EXPECT_NEAR(0u, gray_pixels[3][0], 1); EXPECT_NEAR(0u, gray_pixels[3][1], 1); EXPECT_NEAR(0u, gray_pixels[3][2], 1); EXPECT_NEAR(255u, gray_pixels[3][3], 1); EXPECT_NEAR(255u, gray_pixels[4][0], 1); EXPECT_NEAR(255u, gray_pixels[4][1], 1); EXPECT_NEAR(255u, gray_pixels[4][2], 1); EXPECT_NEAR(255u, gray_pixels[4][3], 1); EXPECT_NEAR(96u, gray_pixels[5][0], 1); EXPECT_NEAR(96u, gray_pixels[5][1], 1); EXPECT_NEAR(96u, gray_pixels[5][2], 1); EXPECT_NEAR(224u, gray_pixels[5][3], 1); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBGrayTo(&orig_pixels[0][0], 0, &gray_pixels[0][0], 0, 1280, 1); } for (int i = 0; i < 256; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i; orig_pixels[i][2] = i; orig_pixels[i][3] = i; } ARGBGray(&orig_pixels[0][0], 0, 0, 0, 256, 1); for (int i = 0; i < 256; ++i) { EXPECT_EQ(i, orig_pixels[i][0]); EXPECT_EQ(i, orig_pixels[i][1]); EXPECT_EQ(i, orig_pixels[i][2]); EXPECT_EQ(i, orig_pixels[i][3]); } } TEST_F(LibYUVPlanarTest, TestARGBSepia) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; 
orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test black orig_pixels[3][0] = 0u; orig_pixels[3][1] = 0u; orig_pixels[3][2] = 0u; orig_pixels[3][3] = 255u; // Test white orig_pixels[4][0] = 255u; orig_pixels[4][1] = 255u; orig_pixels[4][2] = 255u; orig_pixels[4][3] = 255u; // Test color orig_pixels[5][0] = 16u; orig_pixels[5][1] = 64u; orig_pixels[5][2] = 192u; orig_pixels[5][3] = 224u; // Do 16 to test asm version. ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 16, 1); EXPECT_EQ(33u, orig_pixels[0][0]); EXPECT_EQ(43u, orig_pixels[0][1]); EXPECT_EQ(47u, orig_pixels[0][2]); EXPECT_EQ(128u, orig_pixels[0][3]); EXPECT_EQ(135u, orig_pixels[1][0]); EXPECT_EQ(175u, orig_pixels[1][1]); EXPECT_EQ(195u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); EXPECT_EQ(69u, orig_pixels[2][0]); EXPECT_EQ(89u, orig_pixels[2][1]); EXPECT_EQ(99u, orig_pixels[2][2]); EXPECT_EQ(255u, orig_pixels[2][3]); EXPECT_EQ(0u, orig_pixels[3][0]); EXPECT_EQ(0u, orig_pixels[3][1]); EXPECT_EQ(0u, orig_pixels[3][2]); EXPECT_EQ(255u, orig_pixels[3][3]); EXPECT_EQ(239u, orig_pixels[4][0]); EXPECT_EQ(255u, orig_pixels[4][1]); EXPECT_EQ(255u, orig_pixels[4][2]); EXPECT_EQ(255u, orig_pixels[4][3]); EXPECT_EQ(88u, orig_pixels[5][0]); EXPECT_EQ(114u, orig_pixels[5][1]); EXPECT_EQ(127u, orig_pixels[5][2]); EXPECT_EQ(224u, orig_pixels[5][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBSepia(&orig_pixels[0][0], 0, 0, 0, 1280, 1); } } TEST_F(LibYUVPlanarTest, TestARGBColorMatrix) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]); // Matrix for Sepia. 
SIMD_ALIGNED(static const int8_t kRGBToSepia[]) = { 17 / 2, 68 / 2, 35 / 2, 0, 22 / 2, 88 / 2, 45 / 2, 0, 24 / 2, 98 / 2, 50 / 2, 0, 0, 0, 0, 64, // Copy alpha. }; memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test color orig_pixels[3][0] = 16u; orig_pixels[3][1] = 64u; orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; // Do 16 to test asm version. ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, &kRGBToSepia[0], 16, 1); EXPECT_EQ(31u, dst_pixels_opt[0][0]); EXPECT_EQ(43u, dst_pixels_opt[0][1]); EXPECT_EQ(47u, dst_pixels_opt[0][2]); EXPECT_EQ(128u, dst_pixels_opt[0][3]); EXPECT_EQ(135u, dst_pixels_opt[1][0]); EXPECT_EQ(175u, dst_pixels_opt[1][1]); EXPECT_EQ(195u, dst_pixels_opt[1][2]); EXPECT_EQ(0u, dst_pixels_opt[1][3]); EXPECT_EQ(67u, dst_pixels_opt[2][0]); EXPECT_EQ(87u, dst_pixels_opt[2][1]); EXPECT_EQ(99u, dst_pixels_opt[2][2]); EXPECT_EQ(255u, dst_pixels_opt[2][3]); EXPECT_EQ(87u, dst_pixels_opt[3][0]); EXPECT_EQ(112u, dst_pixels_opt[3][1]); EXPECT_EQ(127u, dst_pixels_opt[3][2]); EXPECT_EQ(224u, dst_pixels_opt[3][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } MaskCpuFlags(disable_cpu_flags_); ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, &kRGBToSepia[0], 1280, 1); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBColorMatrix(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, &kRGBToSepia[0], 1280, 1); } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); 
EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); } } TEST_F(LibYUVPlanarTest, TestRGBColorMatrix) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); // Matrix for Sepia. SIMD_ALIGNED(static const int8_t kRGBToSepia[]) = { 17, 68, 35, 0, 22, 88, 45, 0, 24, 98, 50, 0, 0, 0, 0, 0, // Unused but makes matrix 16 bytes. }; memset(orig_pixels, 0, sizeof(orig_pixels)); // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test color orig_pixels[3][0] = 16u; orig_pixels[3][1] = 64u; orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; // Do 16 to test asm version. RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 16, 1); EXPECT_EQ(31u, orig_pixels[0][0]); EXPECT_EQ(43u, orig_pixels[0][1]); EXPECT_EQ(47u, orig_pixels[0][2]); EXPECT_EQ(128u, orig_pixels[0][3]); EXPECT_EQ(135u, orig_pixels[1][0]); EXPECT_EQ(175u, orig_pixels[1][1]); EXPECT_EQ(195u, orig_pixels[1][2]); EXPECT_EQ(0u, orig_pixels[1][3]); EXPECT_EQ(67u, orig_pixels[2][0]); EXPECT_EQ(87u, orig_pixels[2][1]); EXPECT_EQ(99u, orig_pixels[2][2]); EXPECT_EQ(255u, orig_pixels[2][3]); EXPECT_EQ(87u, orig_pixels[3][0]); EXPECT_EQ(112u, orig_pixels[3][1]); EXPECT_EQ(127u, orig_pixels[3][2]); EXPECT_EQ(224u, orig_pixels[3][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { RGBColorMatrix(&orig_pixels[0][0], 0, &kRGBToSepia[0], 0, 0, 1280, 1); } } TEST_F(LibYUVPlanarTest, TestARGBColorTable) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Matrix for Sepia. 
static const uint8_t kARGBTable[256 * 4] = { 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u, }; orig_pixels[0][0] = 0u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 0u; orig_pixels[1][0] = 1u; orig_pixels[1][1] = 1u; orig_pixels[1][2] = 1u; orig_pixels[1][3] = 1u; orig_pixels[2][0] = 2u; orig_pixels[2][1] = 2u; orig_pixels[2][2] = 2u; orig_pixels[2][3] = 2u; orig_pixels[3][0] = 0u; orig_pixels[3][1] = 1u; orig_pixels[3][2] = 2u; orig_pixels[3][3] = 3u; // Do 16 to test asm version. ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1); EXPECT_EQ(1u, orig_pixels[0][0]); EXPECT_EQ(2u, orig_pixels[0][1]); EXPECT_EQ(3u, orig_pixels[0][2]); EXPECT_EQ(4u, orig_pixels[0][3]); EXPECT_EQ(5u, orig_pixels[1][0]); EXPECT_EQ(6u, orig_pixels[1][1]); EXPECT_EQ(7u, orig_pixels[1][2]); EXPECT_EQ(8u, orig_pixels[1][3]); EXPECT_EQ(9u, orig_pixels[2][0]); EXPECT_EQ(10u, orig_pixels[2][1]); EXPECT_EQ(11u, orig_pixels[2][2]); EXPECT_EQ(12u, orig_pixels[2][3]); EXPECT_EQ(1u, orig_pixels[3][0]); EXPECT_EQ(6u, orig_pixels[3][1]); EXPECT_EQ(11u, orig_pixels[3][2]); EXPECT_EQ(16u, orig_pixels[3][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1); } } // Same as TestARGBColorTable except alpha does not change. TEST_F(LibYUVPlanarTest, TestRGBColorTable) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); // Matrix for Sepia. 
static const uint8_t kARGBTable[256 * 4] = { 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, 15u, 16u, }; orig_pixels[0][0] = 0u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 0u; orig_pixels[1][0] = 1u; orig_pixels[1][1] = 1u; orig_pixels[1][2] = 1u; orig_pixels[1][3] = 1u; orig_pixels[2][0] = 2u; orig_pixels[2][1] = 2u; orig_pixels[2][2] = 2u; orig_pixels[2][3] = 2u; orig_pixels[3][0] = 0u; orig_pixels[3][1] = 1u; orig_pixels[3][2] = 2u; orig_pixels[3][3] = 3u; // Do 16 to test asm version. RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 16, 1); EXPECT_EQ(1u, orig_pixels[0][0]); EXPECT_EQ(2u, orig_pixels[0][1]); EXPECT_EQ(3u, orig_pixels[0][2]); EXPECT_EQ(0u, orig_pixels[0][3]); // Alpha unchanged. EXPECT_EQ(5u, orig_pixels[1][0]); EXPECT_EQ(6u, orig_pixels[1][1]); EXPECT_EQ(7u, orig_pixels[1][2]); EXPECT_EQ(1u, orig_pixels[1][3]); // Alpha unchanged. EXPECT_EQ(9u, orig_pixels[2][0]); EXPECT_EQ(10u, orig_pixels[2][1]); EXPECT_EQ(11u, orig_pixels[2][2]); EXPECT_EQ(2u, orig_pixels[2][3]); // Alpha unchanged. EXPECT_EQ(1u, orig_pixels[3][0]); EXPECT_EQ(6u, orig_pixels[3][1]); EXPECT_EQ(11u, orig_pixels[3][2]); EXPECT_EQ(3u, orig_pixels[3][3]); // Alpha unchanged. 
for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { RGBColorTable(&orig_pixels[0][0], 0, &kARGBTable[0], 0, 0, 1280, 1); } } TEST_F(LibYUVPlanarTest, TestARGBQuantize) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); for (int i = 0; i < 1280; ++i) { EXPECT_EQ((i / 8 * 8 + 8 / 2) & 255, orig_pixels[i][0]); EXPECT_EQ((i / 2 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][1]); EXPECT_EQ((i / 3 / 8 * 8 + 8 / 2) & 255, orig_pixels[i][2]); EXPECT_EQ(i & 255, orig_pixels[i][3]); } for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBQuantize(&orig_pixels[0][0], 0, (65536 + (8 / 2)) / 8, 8, 8 / 2, 0, 0, 1280, 1); } } TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) { align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4); align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_ * 4); align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4); MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4); MaskCpuFlags(disable_cpu_flags_); ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) { 
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_); align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_); align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_); MemRandomize(src_pixels, benchmark_width_ * benchmark_height_); MaskCpuFlags(disable_cpu_flags_); MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_, benchmark_width_, benchmark_height_); } for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) { align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2); align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_ * 2); align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2); MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2); MaskCpuFlags(disable_cpu_flags_); MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_); } for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, TestShade) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t shade_pixels[1280][4]); 
memset(orig_pixels, 0, sizeof(orig_pixels)); orig_pixels[0][0] = 10u; orig_pixels[0][1] = 20u; orig_pixels[0][2] = 40u; orig_pixels[0][3] = 80u; orig_pixels[1][0] = 0u; orig_pixels[1][1] = 0u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 255u; orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 0u; orig_pixels[2][3] = 0u; orig_pixels[3][0] = 0u; orig_pixels[3][1] = 0u; orig_pixels[3][2] = 0u; orig_pixels[3][3] = 0u; // Do 8 pixels to allow opt version to be used. ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff); EXPECT_EQ(10u, shade_pixels[0][0]); EXPECT_EQ(20u, shade_pixels[0][1]); EXPECT_EQ(40u, shade_pixels[0][2]); EXPECT_EQ(40u, shade_pixels[0][3]); EXPECT_EQ(0u, shade_pixels[1][0]); EXPECT_EQ(0u, shade_pixels[1][1]); EXPECT_EQ(0u, shade_pixels[1][2]); EXPECT_EQ(128u, shade_pixels[1][3]); EXPECT_EQ(0u, shade_pixels[2][0]); EXPECT_EQ(0u, shade_pixels[2][1]); EXPECT_EQ(0u, shade_pixels[2][2]); EXPECT_EQ(0u, shade_pixels[2][3]); EXPECT_EQ(0u, shade_pixels[3][0]); EXPECT_EQ(0u, shade_pixels[3][1]); EXPECT_EQ(0u, shade_pixels[3][2]); EXPECT_EQ(0u, shade_pixels[3][3]); ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080); EXPECT_EQ(5u, shade_pixels[0][0]); EXPECT_EQ(10u, shade_pixels[0][1]); EXPECT_EQ(20u, shade_pixels[0][2]); EXPECT_EQ(40u, shade_pixels[0][3]); ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080); EXPECT_EQ(5u, shade_pixels[0][0]); EXPECT_EQ(5u, shade_pixels[0][1]); EXPECT_EQ(5u, shade_pixels[0][2]); EXPECT_EQ(5u, shade_pixels[0][3]); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 1280, 1, 0x80808080); } } TEST_F(LibYUVPlanarTest, TestARGBInterpolate) { SIMD_ALIGNED(uint8_t orig_pixels_0[1280][4]); SIMD_ALIGNED(uint8_t orig_pixels_1[1280][4]); SIMD_ALIGNED(uint8_t interpolate_pixels[1280][4]); memset(orig_pixels_0, 0, sizeof(orig_pixels_0)); memset(orig_pixels_1, 0, sizeof(orig_pixels_1)); 
orig_pixels_0[0][0] = 16u; orig_pixels_0[0][1] = 32u; orig_pixels_0[0][2] = 64u; orig_pixels_0[0][3] = 128u; orig_pixels_0[1][0] = 0u; orig_pixels_0[1][1] = 0u; orig_pixels_0[1][2] = 0u; orig_pixels_0[1][3] = 255u; orig_pixels_0[2][0] = 0u; orig_pixels_0[2][1] = 0u; orig_pixels_0[2][2] = 0u; orig_pixels_0[2][3] = 0u; orig_pixels_0[3][0] = 0u; orig_pixels_0[3][1] = 0u; orig_pixels_0[3][2] = 0u; orig_pixels_0[3][3] = 0u; orig_pixels_1[0][0] = 0u; orig_pixels_1[0][1] = 0u; orig_pixels_1[0][2] = 0u; orig_pixels_1[0][3] = 0u; orig_pixels_1[1][0] = 0u; orig_pixels_1[1][1] = 0u; orig_pixels_1[1][2] = 0u; orig_pixels_1[1][3] = 0u; orig_pixels_1[2][0] = 0u; orig_pixels_1[2][1] = 0u; orig_pixels_1[2][2] = 0u; orig_pixels_1[2][3] = 0u; orig_pixels_1[3][0] = 255u; orig_pixels_1[3][1] = 255u; orig_pixels_1[3][2] = 255u; orig_pixels_1[3][3] = 255u; ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, &interpolate_pixels[0][0], 0, 4, 1, 128); EXPECT_EQ(8u, interpolate_pixels[0][0]); EXPECT_EQ(16u, interpolate_pixels[0][1]); EXPECT_EQ(32u, interpolate_pixels[0][2]); EXPECT_EQ(64u, interpolate_pixels[0][3]); EXPECT_EQ(0u, interpolate_pixels[1][0]); EXPECT_EQ(0u, interpolate_pixels[1][1]); EXPECT_EQ(0u, interpolate_pixels[1][2]); EXPECT_EQ(128u, interpolate_pixels[1][3]); EXPECT_EQ(0u, interpolate_pixels[2][0]); EXPECT_EQ(0u, interpolate_pixels[2][1]); EXPECT_EQ(0u, interpolate_pixels[2][2]); EXPECT_EQ(0u, interpolate_pixels[2][3]); EXPECT_EQ(128u, interpolate_pixels[3][0]); EXPECT_EQ(128u, interpolate_pixels[3][1]); EXPECT_EQ(128u, interpolate_pixels[3][2]); EXPECT_EQ(128u, interpolate_pixels[3][3]); ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, &interpolate_pixels[0][0], 0, 4, 1, 0); EXPECT_EQ(16u, interpolate_pixels[0][0]); EXPECT_EQ(32u, interpolate_pixels[0][1]); EXPECT_EQ(64u, interpolate_pixels[0][2]); EXPECT_EQ(128u, interpolate_pixels[0][3]); ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, &interpolate_pixels[0][0], 0, 4, 
1, 192); EXPECT_EQ(4u, interpolate_pixels[0][0]); EXPECT_EQ(8u, interpolate_pixels[0][1]); EXPECT_EQ(16u, interpolate_pixels[0][2]); EXPECT_EQ(32u, interpolate_pixels[0][3]); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBInterpolate(&orig_pixels_0[0][0], 0, &orig_pixels_1[0][0], 0, &interpolate_pixels[0][0], 0, 1280, 1, 128); } } TEST_F(LibYUVPlanarTest, TestInterpolatePlane) { SIMD_ALIGNED(uint8_t orig_pixels_0[1280]); SIMD_ALIGNED(uint8_t orig_pixels_1[1280]); SIMD_ALIGNED(uint8_t interpolate_pixels[1280]); memset(orig_pixels_0, 0, sizeof(orig_pixels_0)); memset(orig_pixels_1, 0, sizeof(orig_pixels_1)); orig_pixels_0[0] = 16u; orig_pixels_0[1] = 32u; orig_pixels_0[2] = 64u; orig_pixels_0[3] = 128u; orig_pixels_0[4] = 0u; orig_pixels_0[5] = 0u; orig_pixels_0[6] = 0u; orig_pixels_0[7] = 255u; orig_pixels_0[8] = 0u; orig_pixels_0[9] = 0u; orig_pixels_0[10] = 0u; orig_pixels_0[11] = 0u; orig_pixels_0[12] = 0u; orig_pixels_0[13] = 0u; orig_pixels_0[14] = 0u; orig_pixels_0[15] = 0u; orig_pixels_1[0] = 0u; orig_pixels_1[1] = 0u; orig_pixels_1[2] = 0u; orig_pixels_1[3] = 0u; orig_pixels_1[4] = 0u; orig_pixels_1[5] = 0u; orig_pixels_1[6] = 0u; orig_pixels_1[7] = 0u; orig_pixels_1[8] = 0u; orig_pixels_1[9] = 0u; orig_pixels_1[10] = 0u; orig_pixels_1[11] = 0u; orig_pixels_1[12] = 255u; orig_pixels_1[13] = 255u; orig_pixels_1[14] = 255u; orig_pixels_1[15] = 255u; InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0, &interpolate_pixels[0], 0, 16, 1, 128); EXPECT_EQ(8u, interpolate_pixels[0]); EXPECT_EQ(16u, interpolate_pixels[1]); EXPECT_EQ(32u, interpolate_pixels[2]); EXPECT_EQ(64u, interpolate_pixels[3]); EXPECT_EQ(0u, interpolate_pixels[4]); EXPECT_EQ(0u, interpolate_pixels[5]); EXPECT_EQ(0u, interpolate_pixels[6]); EXPECT_EQ(128u, interpolate_pixels[7]); EXPECT_EQ(0u, interpolate_pixels[8]); EXPECT_EQ(0u, interpolate_pixels[9]); EXPECT_EQ(0u, interpolate_pixels[10]); EXPECT_EQ(0u, interpolate_pixels[11]); EXPECT_EQ(128u, interpolate_pixels[12]); 
EXPECT_EQ(128u, interpolate_pixels[13]); EXPECT_EQ(128u, interpolate_pixels[14]); EXPECT_EQ(128u, interpolate_pixels[15]); InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0, &interpolate_pixels[0], 0, 16, 1, 0); EXPECT_EQ(16u, interpolate_pixels[0]); EXPECT_EQ(32u, interpolate_pixels[1]); EXPECT_EQ(64u, interpolate_pixels[2]); EXPECT_EQ(128u, interpolate_pixels[3]); InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0, &interpolate_pixels[0], 0, 16, 1, 192); EXPECT_EQ(4u, interpolate_pixels[0]); EXPECT_EQ(8u, interpolate_pixels[1]); EXPECT_EQ(16u, interpolate_pixels[2]); EXPECT_EQ(32u, interpolate_pixels[3]); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { InterpolatePlane(&orig_pixels_0[0], 0, &orig_pixels_1[0], 0, &interpolate_pixels[0], 0, 1280, 1, 123); } } #define TESTTERP(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B, STRIDE_B, W1280, TERP, \ N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, ARGBInterpolate##TERP##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kStrideA = \ (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \ align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF); \ align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \ align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \ for (int i = 0; i < kStrideA * kHeight; ++i) { \ src_argb_a[i + OFF] = (fastrand() & 0xff); \ src_argb_b[i + OFF] = (fastrand() & 0xff); \ } \ MaskCpuFlags(disable_cpu_flags_); \ ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \ dst_argb_c, kStrideB, kWidth, NEG kHeight, TERP); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ ARGBInterpolate(src_argb_a + OFF, kStrideA, src_argb_b + OFF, kStrideA, \ dst_argb_opt, kStrideB, kWidth, NEG kHeight, TERP); \ } \ for (int i = 0; i < kStrideB * kHeight; ++i) { \ 
EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ free_aligned_buffer_page_end(src_argb_a); \ free_aligned_buffer_page_end(src_argb_b); \ free_aligned_buffer_page_end(dst_argb_c); \ free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTINTERPOLATE(TERP) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_ + 1, TERP, _Any, +, 0) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Unaligned, +, 1) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Invert, -, 0) \ TESTTERP(ARGB, 4, 1, ARGB, 4, 1, benchmark_width_, TERP, _Opt, +, 0) TESTINTERPOLATE(0) TESTINTERPOLATE(64) TESTINTERPOLATE(128) TESTINTERPOLATE(192) TESTINTERPOLATE(255) static int TestBlend(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off, int attenuate) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(src_argb_b, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); } MemRandomize(src_argb_a, kStride * height + off); MemRandomize(src_argb_b, kStride * height + off); if (attenuate) { ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width, height); } memset(dst_argb_c, 255, kStride * height); memset(dst_argb_opt, 255, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBBlend(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - 
static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(src_argb_b); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBBlend_Any) { int max_diff = TestBlend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } static void TestBlendPlane(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 1; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(src_argb_b, kStride * height + off); align_buffer_page_end(src_argb_alpha, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height + off); align_buffer_page_end(dst_argb_opt, kStride * height + off); memset(dst_argb_c, 255, kStride * height + off); memset(dst_argb_opt, 255, kStride * height + off); // Test source is maintained exactly if alpha is 
255. for (int i = 0; i < width; ++i) { src_argb_a[i + off] = i & 255; src_argb_b[i + off] = 255 - (i & 255); } memset(src_argb_alpha + off, 255, width); BlendPlane(src_argb_a + off, width, src_argb_b + off, width, src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1); for (int i = 0; i < width; ++i) { EXPECT_EQ(src_argb_a[i + off], dst_argb_opt[i + off]); } // Test destination is maintained exactly if alpha is 0. memset(src_argb_alpha + off, 0, width); BlendPlane(src_argb_a + off, width, src_argb_b + off, width, src_argb_alpha + off, width, dst_argb_opt + off, width, width, 1); for (int i = 0; i < width; ++i) { EXPECT_EQ(src_argb_b[i + off], dst_argb_opt[i + off]); } for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); src_argb_alpha[i + off] = (fastrand() & 0xff); } MaskCpuFlags(disable_cpu_flags); BlendPlane(src_argb_a + off, width, src_argb_b + off, width, src_argb_alpha + off, width, dst_argb_c + off, width, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { BlendPlane(src_argb_a + off, width, src_argb_b + off, width, src_argb_alpha + off, width, dst_argb_opt + off, width, width, invert * height); } for (int i = 0; i < kStride * height; ++i) { EXPECT_EQ(dst_argb_c[i + off], dst_argb_opt[i + off]); } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(src_argb_b); free_aligned_buffer_page_end(src_argb_alpha); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); } TEST_F(LibYUVPlanarTest, BlendPlane_Opt) { TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, BlendPlane_Unaligned) { TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } TEST_F(LibYUVPlanarTest, BlendPlane_Any) { 
TestBlendPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } TEST_F(LibYUVPlanarTest, BlendPlane_Invert) { TestBlendPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 1); } #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) static void TestI420Blend(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { width = ((width) > 0) ? (width) : 1; const int kStrideUV = SUBSAMPLE(width, 2); const int kSizeUV = kStrideUV * SUBSAMPLE(height, 2); align_buffer_page_end(src_y0, width * height + off); align_buffer_page_end(src_u0, kSizeUV + off); align_buffer_page_end(src_v0, kSizeUV + off); align_buffer_page_end(src_y1, width * height + off); align_buffer_page_end(src_u1, kSizeUV + off); align_buffer_page_end(src_v1, kSizeUV + off); align_buffer_page_end(src_a, width * height + off); align_buffer_page_end(dst_y_c, width * height + off); align_buffer_page_end(dst_u_c, kSizeUV + off); align_buffer_page_end(dst_v_c, kSizeUV + off); align_buffer_page_end(dst_y_opt, width * height + off); align_buffer_page_end(dst_u_opt, kSizeUV + off); align_buffer_page_end(dst_v_opt, kSizeUV + off); MemRandomize(src_y0, width * height + off); MemRandomize(src_u0, kSizeUV + off); MemRandomize(src_v0, kSizeUV + off); MemRandomize(src_y1, width * height + off); MemRandomize(src_u1, kSizeUV + off); MemRandomize(src_v1, kSizeUV + off); MemRandomize(src_a, width * height + off); memset(dst_y_c, 255, width * height + off); memset(dst_u_c, 255, kSizeUV + off); memset(dst_v_c, 255, kSizeUV + off); memset(dst_y_opt, 255, width * height + off); memset(dst_u_opt, 255, kSizeUV + off); memset(dst_v_opt, 255, kSizeUV + off); MaskCpuFlags(disable_cpu_flags); I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off, kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV, src_v1 + off, kStrideUV, src_a + off, width, 
dst_y_c + off, width, dst_u_c + off, kStrideUV, dst_v_c + off, kStrideUV, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { I420Blend(src_y0 + off, width, src_u0 + off, kStrideUV, src_v0 + off, kStrideUV, src_y1 + off, width, src_u1 + off, kStrideUV, src_v1 + off, kStrideUV, src_a + off, width, dst_y_opt + off, width, dst_u_opt + off, kStrideUV, dst_v_opt + off, kStrideUV, width, invert * height); } for (int i = 0; i < width * height; ++i) { EXPECT_EQ(dst_y_c[i + off], dst_y_opt[i + off]); } for (int i = 0; i < kSizeUV; ++i) { EXPECT_EQ(dst_u_c[i + off], dst_u_opt[i + off]); EXPECT_EQ(dst_v_c[i + off], dst_v_opt[i + off]); } free_aligned_buffer_page_end(src_y0); free_aligned_buffer_page_end(src_u0); free_aligned_buffer_page_end(src_v0); free_aligned_buffer_page_end(src_y1); free_aligned_buffer_page_end(src_u1); free_aligned_buffer_page_end(src_v1); free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(dst_y_c); free_aligned_buffer_page_end(dst_u_c); free_aligned_buffer_page_end(dst_v_c); free_aligned_buffer_page_end(dst_y_opt); free_aligned_buffer_page_end(dst_u_opt); free_aligned_buffer_page_end(dst_v_opt); } TEST_F(LibYUVPlanarTest, I420Blend_Opt) { TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, I420Blend_Unaligned) { TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); } // TODO(fbarchard): DISABLED because _Any uses C. Avoid C and re-enable. 
TEST_F(LibYUVPlanarTest, DISABLED_I420Blend_Any) { TestI420Blend(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); } TEST_F(LibYUVPlanarTest, I420Blend_Invert) { TestI420Blend(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); } TEST_F(LibYUVPlanarTest, TestAffine) { SIMD_ALIGNED(uint8_t orig_pixels_0[1280][4]); SIMD_ALIGNED(uint8_t interpolate_pixels_C[1280][4]); for (int i = 0; i < 1280; ++i) { for (int j = 0; j < 4; ++j) { orig_pixels_0[i][j] = i; } } float uv_step[4] = {0.f, 0.f, 0.75f, 0.f}; ARGBAffineRow_C(&orig_pixels_0[0][0], 0, &interpolate_pixels_C[0][0], uv_step, 1280); EXPECT_EQ(0u, interpolate_pixels_C[0][0]); EXPECT_EQ(96u, interpolate_pixels_C[128][0]); EXPECT_EQ(191u, interpolate_pixels_C[255][3]); #if defined(HAS_ARGBAFFINEROW_SSE2) SIMD_ALIGNED(uint8_t interpolate_pixels_Opt[1280][4]); ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], uv_step, 1280); EXPECT_EQ(0, memcmp(interpolate_pixels_Opt, interpolate_pixels_C, 1280 * 4)); int has_sse2 = TestCpuFlag(kCpuHasSSE2); if (has_sse2) { for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBAffineRow_SSE2(&orig_pixels_0[0][0], 0, &interpolate_pixels_Opt[0][0], uv_step, 1280); } } #endif } TEST_F(LibYUVPlanarTest, TestCopyPlane) { int err = 0; int yw = benchmark_width_; int yh = benchmark_height_; int b = 12; int i, j; int y_plane_size = (yw + b * 2) * (yh + b * 2); align_buffer_page_end(orig_y, y_plane_size); align_buffer_page_end(dst_c, y_plane_size); align_buffer_page_end(dst_opt, y_plane_size); memset(orig_y, 0, y_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 0, y_plane_size); // Fill image buffers with random data. for (i = b; i < (yh + b); ++i) { for (j = b; j < (yw + b); ++j) { orig_y[i * (yw + b * 2) + j] = fastrand() & 0xff; } } // Fill destination buffers with random data. 
for (i = 0; i < y_plane_size; ++i) { uint8_t random_number = fastrand() & 0x7f; dst_c[i] = random_number; dst_opt[i] = dst_c[i]; } int y_off = b * (yw + b * 2) + b; int y_st = yw + b * 2; int stride = 8; // Disable all optimizations. MaskCpuFlags(disable_cpu_flags_); for (j = 0; j < benchmark_iterations_; j++) { CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh); } // Enable optimizations. MaskCpuFlags(benchmark_cpu_info_); for (j = 0; j < benchmark_iterations_; j++) { CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh); } for (i = 0; i < y_plane_size; ++i) { if (dst_c[i] != dst_opt[i]) { ++err; } } free_aligned_buffer_page_end(orig_y); free_aligned_buffer_page_end(dst_c); free_aligned_buffer_page_end(dst_opt); EXPECT_EQ(0, err); } static int TestMultiply(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(src_argb_b, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); } memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBMultiply(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } 
free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(src_argb_b); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBMultiply_Any) { int max_diff = TestMultiply(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBMultiply_Unaligned) { int max_diff = TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBMultiply_Invert) { int max_diff = TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBMultiply_Opt) { int max_diff = TestMultiply(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } static int TestAdd(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(src_argb_b, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); } memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBAdd(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt, 
kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(src_argb_b); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBAdd_Any) { int max_diff = TestAdd(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBAdd_Unaligned) { int max_diff = TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBAdd_Invert) { int max_diff = TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBAdd_Opt) { int max_diff = TestAdd(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } static int TestSubtract(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(src_argb_b, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); } memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride, 
dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBSubtract(src_argb_a + off, kStride, src_argb_b + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(src_argb_b); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBSubtract_Any) { int max_diff = TestSubtract(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBSubtract_Unaligned) { int max_diff = TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBSubtract_Invert) { int max_diff = TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBSubtract_Opt) { int max_diff = TestSubtract(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_LE(max_diff, 1); } static int TestSobel(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); memset(src_argb_a, 0, kStride * height + off); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); 
} memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBSobel(src_argb_a + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBSobel(src_argb_a + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBSobel_Any) { int max_diff = TestSobel(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobel_Unaligned) { int max_diff = TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobel_Invert) { int max_diff = TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobel_Opt) { int max_diff = TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } static int TestSobelToPlane(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } const int kSrcBpp = 4; const int kDstBpp = 1; const int kSrcStride = (width * kSrcBpp + 15) & ~15; const int kDstStride = (width * kDstBpp + 15) & ~15; align_buffer_page_end(src_argb_a, kSrcStride * height + off); align_buffer_page_end(dst_argb_c, kDstStride * height); 
align_buffer_page_end(dst_argb_opt, kDstStride * height); memset(src_argb_a, 0, kSrcStride * height + off); for (int i = 0; i < kSrcStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); } memset(dst_argb_c, 0, kDstStride * height); memset(dst_argb_opt, 0, kDstStride * height); MaskCpuFlags(disable_cpu_flags); ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_c, kDstStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBSobelToPlane(src_argb_a + off, kSrcStride, dst_argb_opt, kDstStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kDstStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Any) { int max_diff = TestSobelToPlane(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Unaligned) { int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Invert) { int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelToPlane_Opt) { int max_diff = TestSobelToPlane(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } static int TestSobelXY(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off) { if (width < 1) { width = 1; } 
const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); memset(src_argb_a, 0, kStride * height + off); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); } memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBSobelXY(src_argb_a + off, kStride, dst_argb_c, kStride, width, invert * height); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBSobelXY(src_argb_a + off, kStride, dst_argb_opt, kStride, width, invert * height); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Any) { int max_diff = TestSobelXY(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Unaligned) { int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Invert) { int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBSobelXY_Opt) { int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); EXPECT_EQ(0, max_diff); } static int TestBlur(int width, int height, int benchmark_iterations, int 
disable_cpu_flags, int benchmark_cpu_info, int invert, int off, int radius) { if (width < 1) { width = 1; } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_page_end(src_argb_a, kStride * height + off); align_buffer_page_end(dst_cumsum, width * height * 16); align_buffer_page_end(dst_argb_c, kStride * height); align_buffer_page_end(dst_argb_opt, kStride * height); for (int i = 0; i < kStride * height; ++i) { src_argb_a[i + off] = (fastrand() & 0xff); } memset(dst_cumsum, 0, width * height * 16); memset(dst_argb_c, 0, kStride * height); memset(dst_argb_opt, 0, kStride * height); MaskCpuFlags(disable_cpu_flags); ARGBBlur(src_argb_a + off, kStride, dst_argb_c, kStride, reinterpret_cast(dst_cumsum), width * 4, width, invert * height, radius); MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { ARGBBlur(src_argb_a + off, kStride, dst_argb_opt, kStride, reinterpret_cast(dst_cumsum), width * 4, width, invert * height, radius); } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i]) - static_cast(dst_argb_opt[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(src_argb_a); free_aligned_buffer_page_end(dst_cumsum); free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) #define DISABLED_ARM(name) name #else #define DISABLED_ARM(name) DISABLED_##name #endif static const int kBlurSize = 55; TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Any)) { int max_diff = TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Unaligned)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSize); 
EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Invert)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlur_Opt)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSize); EXPECT_LE(max_diff, 1); } static const int kBlurSmallSize = 5; TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Any)) { int max_diff = TestBlur(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Unaligned)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Invert)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(ARGBBlurSmall_Opt)) { int max_diff = TestBlur(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, kBlurSmallSize); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, DISABLED_ARM(TestARGBPolynomial)) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); SIMD_ALIGNED(static const float kWarmifyPolynomial[16]) = { 0.94230f, -3.03300f, -2.92500f, 0.f, // C0 0.584500f, 1.112000f, 1.535000f, 1.f, // C1 x 0.001313f, -0.002503f, -0.004496f, 0.f, // C2 x * x 0.0f, 0.000006965f, 0.000008781f, 0.f, // C3 x * x * x }; // Test blue 
orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test white orig_pixels[3][0] = 255u; orig_pixels[3][1] = 255u; orig_pixels[3][2] = 255u; orig_pixels[3][3] = 255u; // Test color orig_pixels[4][0] = 16u; orig_pixels[4][1] = 64u; orig_pixels[4][2] = 192u; orig_pixels[4][3] = 224u; // Do 16 to test asm version. ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, &kWarmifyPolynomial[0], 16, 1); EXPECT_EQ(235u, dst_pixels_opt[0][0]); EXPECT_EQ(0u, dst_pixels_opt[0][1]); EXPECT_EQ(0u, dst_pixels_opt[0][2]); EXPECT_EQ(128u, dst_pixels_opt[0][3]); EXPECT_EQ(0u, dst_pixels_opt[1][0]); EXPECT_EQ(233u, dst_pixels_opt[1][1]); EXPECT_EQ(0u, dst_pixels_opt[1][2]); EXPECT_EQ(0u, dst_pixels_opt[1][3]); EXPECT_EQ(0u, dst_pixels_opt[2][0]); EXPECT_EQ(0u, dst_pixels_opt[2][1]); EXPECT_EQ(241u, dst_pixels_opt[2][2]); EXPECT_EQ(255u, dst_pixels_opt[2][3]); EXPECT_EQ(235u, dst_pixels_opt[3][0]); EXPECT_EQ(233u, dst_pixels_opt[3][1]); EXPECT_EQ(241u, dst_pixels_opt[3][2]); EXPECT_EQ(255u, dst_pixels_opt[3][3]); EXPECT_EQ(10u, dst_pixels_opt[4][0]); EXPECT_EQ(59u, dst_pixels_opt[4][1]); EXPECT_EQ(188u, dst_pixels_opt[4][2]); EXPECT_EQ(224u, dst_pixels_opt[4][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } MaskCpuFlags(disable_cpu_flags_); ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, &kWarmifyPolynomial[0], 1280, 1); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, &kWarmifyPolynomial[0], 1280, 1); } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); 
EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); } } int TestHalfFloatPlane(int benchmark_width, int benchmark_height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, float scale, int mask) { int i, j; const int y_plane_size = benchmark_width * benchmark_height * 2; align_buffer_page_end(orig_y, y_plane_size * 3); uint8_t* dst_opt = orig_y + y_plane_size; uint8_t* dst_c = orig_y + y_plane_size * 2; MemRandomize(orig_y, y_plane_size); memset(dst_c, 0, y_plane_size); memset(dst_opt, 1, y_plane_size); for (i = 0; i < y_plane_size / 2; ++i) { reinterpret_cast(orig_y)[i] &= mask; } // Disable all optimizations. MaskCpuFlags(disable_cpu_flags); for (j = 0; j < benchmark_iterations; j++) { HalfFloatPlane(reinterpret_cast(orig_y), benchmark_width * 2, reinterpret_cast(dst_c), benchmark_width * 2, scale, benchmark_width, benchmark_height); } // Enable optimizations. MaskCpuFlags(benchmark_cpu_info); for (j = 0; j < benchmark_iterations; j++) { HalfFloatPlane(reinterpret_cast(orig_y), benchmark_width * 2, reinterpret_cast(dst_opt), benchmark_width * 2, scale, benchmark_width, benchmark_height); } int max_diff = 0; for (i = 0; i < y_plane_size / 2; ++i) { int abs_diff = abs(static_cast(reinterpret_cast(dst_c)[i]) - static_cast(reinterpret_cast(dst_opt)[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(orig_y); return max_diff; } #if defined(__arm__) static void EnableFlushDenormalToZero(void) { uint32_t cw; __asm__ __volatile__( "vmrs %0, fpscr \n" "orr %0, %0, #0x1000000 \n" "vmsr fpscr, %0 \n" : "=r"(cw)::"memory"); } #endif // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes // exponent to be less than 0. 
15 - log2(65536) = -1/ This shouldnt normally // happen since scale is 1/(1<(orig_y + y_plane_size); float* dst_c = reinterpret_cast(orig_y + y_plane_size * 5); MemRandomize(orig_y, y_plane_size); memset(dst_c, 0, y_plane_size * 4); memset(dst_opt, 1, y_plane_size * 4); // Disable all optimizations. MaskCpuFlags(disable_cpu_flags); ByteToFloat(orig_y, dst_c, scale, y_plane_size); // Enable optimizations. MaskCpuFlags(benchmark_cpu_info); for (j = 0; j < benchmark_iterations; j++) { ByteToFloat(orig_y, dst_opt, scale, y_plane_size); } float max_diff = 0; for (i = 0; i < y_plane_size; ++i) { float abs_diff = fabs(dst_c[i] - dst_opt[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(orig_y); return max_diff; } TEST_F(LibYUVPlanarTest, TestByteToFloat) { float diff = TestByteToFloat(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, 1.0f); EXPECT_EQ(0.f, diff); } TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) { SIMD_ALIGNED(uint8_t orig_pixels[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]); SIMD_ALIGNED(uint8_t dst_pixels_c[1280][4]); memset(orig_pixels, 0, sizeof(orig_pixels)); align_buffer_page_end(lumacolortable, 32768); int v = 0; for (int i = 0; i < 32768; ++i) { lumacolortable[i] = v; v += 3; } // Test blue orig_pixels[0][0] = 255u; orig_pixels[0][1] = 0u; orig_pixels[0][2] = 0u; orig_pixels[0][3] = 128u; // Test green orig_pixels[1][0] = 0u; orig_pixels[1][1] = 255u; orig_pixels[1][2] = 0u; orig_pixels[1][3] = 0u; // Test red orig_pixels[2][0] = 0u; orig_pixels[2][1] = 0u; orig_pixels[2][2] = 255u; orig_pixels[2][3] = 255u; // Test color orig_pixels[3][0] = 16u; orig_pixels[3][1] = 64u; orig_pixels[3][2] = 192u; orig_pixels[3][3] = 224u; // Do 16 to test asm version. 
ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, &lumacolortable[0], 16, 1); EXPECT_EQ(253u, dst_pixels_opt[0][0]); EXPECT_EQ(0u, dst_pixels_opt[0][1]); EXPECT_EQ(0u, dst_pixels_opt[0][2]); EXPECT_EQ(128u, dst_pixels_opt[0][3]); EXPECT_EQ(0u, dst_pixels_opt[1][0]); EXPECT_EQ(253u, dst_pixels_opt[1][1]); EXPECT_EQ(0u, dst_pixels_opt[1][2]); EXPECT_EQ(0u, dst_pixels_opt[1][3]); EXPECT_EQ(0u, dst_pixels_opt[2][0]); EXPECT_EQ(0u, dst_pixels_opt[2][1]); EXPECT_EQ(253u, dst_pixels_opt[2][2]); EXPECT_EQ(255u, dst_pixels_opt[2][3]); EXPECT_EQ(48u, dst_pixels_opt[3][0]); EXPECT_EQ(192u, dst_pixels_opt[3][1]); EXPECT_EQ(64u, dst_pixels_opt[3][2]); EXPECT_EQ(224u, dst_pixels_opt[3][3]); for (int i = 0; i < 1280; ++i) { orig_pixels[i][0] = i; orig_pixels[i][1] = i / 2; orig_pixels[i][2] = i / 3; orig_pixels[i][3] = i; } MaskCpuFlags(disable_cpu_flags_); ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_c[0][0], 0, lumacolortable, 1280, 1); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { ARGBLumaColorTable(&orig_pixels[0][0], 0, &dst_pixels_opt[0][0], 0, lumacolortable, 1280, 1); } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i][0], dst_pixels_opt[i][0]); EXPECT_EQ(dst_pixels_c[i][1], dst_pixels_opt[i][1]); EXPECT_EQ(dst_pixels_c[i][2], dst_pixels_opt[i][2]); EXPECT_EQ(dst_pixels_c[i][3], dst_pixels_opt[i][3]); } free_aligned_buffer_page_end(lumacolortable); } TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) { const int kSize = benchmark_width_ * benchmark_height_ * 4; align_buffer_page_end(orig_pixels, kSize); align_buffer_page_end(dst_pixels_opt, kSize); align_buffer_page_end(dst_pixels_c, kSize); MemRandomize(orig_pixels, kSize); MemRandomize(dst_pixels_opt, kSize); memcpy(dst_pixels_c, dst_pixels_opt, kSize); MaskCpuFlags(disable_cpu_flags_); ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for 
(int i = 0; i < benchmark_iterations_; ++i) { ARGBCopyAlpha(orig_pixels, benchmark_width_ * 4, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < kSize; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(dst_pixels_c); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(dst_pixels_opt, kPixels); align_buffer_page_end(dst_pixels_c, kPixels); MemRandomize(src_pixels, kPixels * 4); MemRandomize(dst_pixels_opt, kPixels); memcpy(dst_pixels_c, dst_pixels_opt, kPixels); MaskCpuFlags(disable_cpu_flags_); ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_c, benchmark_width_, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { ARGBExtractAlpha(src_pixels, benchmark_width_ * 4, dst_pixels_opt, benchmark_width_, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(dst_pixels_c); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(src_pixels); } TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); MemRandomize(orig_pixels, kPixels); MemRandomize(dst_pixels_opt, kPixels * 4); memcpy(dst_pixels_c, dst_pixels_opt, kPixels * 4); MaskCpuFlags(disable_cpu_flags_); ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { 
ARGBCopyYToAlpha(orig_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(dst_pixels_c); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(orig_pixels); } static int TestARGBRect(int width, int height, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, int invert, int off, int bpp) { if (width < 1) { width = 1; } const int kStride = width * bpp; const int kSize = kStride * height; const uint32_t v32 = fastrand() & (bpp == 4 ? 0xffffffff : 0xff); align_buffer_page_end(dst_argb_c, kSize + off); align_buffer_page_end(dst_argb_opt, kSize + off); MemRandomize(dst_argb_c + off, kSize); memcpy(dst_argb_opt + off, dst_argb_c + off, kSize); MaskCpuFlags(disable_cpu_flags); if (bpp == 4) { ARGBRect(dst_argb_c + off, kStride, 0, 0, width, invert * height, v32); } else { SetPlane(dst_argb_c + off, kStride, width, invert * height, v32); } MaskCpuFlags(benchmark_cpu_info); for (int i = 0; i < benchmark_iterations; ++i) { if (bpp == 4) { ARGBRect(dst_argb_opt + off, kStride, 0, 0, width, invert * height, v32); } else { SetPlane(dst_argb_opt + off, kStride, width, invert * height, v32); } } int max_diff = 0; for (int i = 0; i < kStride * height; ++i) { int abs_diff = abs(static_cast(dst_argb_c[i + off]) - static_cast(dst_argb_opt[i + off])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); return max_diff; } TEST_F(LibYUVPlanarTest, ARGBRect_Any) { int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBRect_Unaligned) { int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, 
benchmark_cpu_info_, +1, 1, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBRect_Invert) { int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, ARGBRect_Opt) { int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 4); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Any) { int max_diff = TestARGBRect(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Unaligned) { int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Invert) { int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, SetPlane_Opt) { int max_diff = TestARGBRect(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_EQ(0, max_diff); } TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_u, kPixels); align_buffer_page_end(src_pixels_v, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); MemRandomize(src_pixels_u, kPixels); MemRandomize(src_pixels_v, kPixels); MemRandomize(dst_pixels_opt, kPixels * 2); MemRandomize(dst_pixels_c, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int 
i = 0; i < benchmark_iterations_; ++i) { MergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 2; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels_u); free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } // 16 bit channel split and merge TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2); MemRandomize(src_pixels_u, kPixels * 2); MemRandomize(src_pixels_v, kPixels * 2); MemRandomize(dst_pixels_opt, kPixels * 2 * 2); MemRandomize(dst_pixels_c, kPixels * 2 * 2); MaskCpuFlags(disable_cpu_flags_); MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, (const uint16_t*)src_pixels_v, benchmark_width_, (uint16_t*)dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_, 12); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { MergeUVPlane_16((const uint16_t*)src_pixels_u, benchmark_width_, (const uint16_t*)src_pixels_v, benchmark_width_, (uint16_t*)dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_, 12); } for (int i = 0; i < kPixels * 2 * 2; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels_u); free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(dst_pixels_u_c, kPixels); 
align_buffer_page_end(dst_pixels_v_c, kPixels); align_buffer_page_end(dst_pixels_u_opt, kPixels); align_buffer_page_end(dst_pixels_v_opt, kPixels); MemRandomize(src_pixels, kPixels * 2); MemRandomize(dst_pixels_u_c, kPixels); MemRandomize(dst_pixels_v_c, kPixels); MemRandomize(dst_pixels_u_opt, kPixels); MemRandomize(dst_pixels_v_opt, kPixels); MaskCpuFlags(disable_cpu_flags_); SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_c, benchmark_width_, dst_pixels_v_c, benchmark_width_, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { SplitUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_u_opt, benchmark_width_, dst_pixels_v_opt, benchmark_width_, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(dst_pixels_u_c); free_aligned_buffer_page_end(dst_pixels_v_c); free_aligned_buffer_page_end(dst_pixels_u_opt); free_aligned_buffer_page_end(dst_pixels_v_opt); } // 16 bit channel split TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2 * 2); align_buffer_page_end(dst_pixels_u_c, kPixels * 2); align_buffer_page_end(dst_pixels_v_c, kPixels * 2); align_buffer_page_end(dst_pixels_u_opt, kPixels * 2); align_buffer_page_end(dst_pixels_v_opt, kPixels * 2); MemRandomize(src_pixels, kPixels * 2 * 2); MemRandomize(dst_pixels_u_c, kPixels * 2); MemRandomize(dst_pixels_v_c, kPixels * 2); MemRandomize(dst_pixels_u_opt, kPixels * 2); MemRandomize(dst_pixels_v_opt, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, (uint16_t*)dst_pixels_u_c, benchmark_width_, (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_, benchmark_height_, 10); 
MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, (uint16_t*)dst_pixels_u_opt, benchmark_width_, (uint16_t*)dst_pixels_v_opt, benchmark_width_, benchmark_width_, benchmark_height_, 10); } for (int i = 0; i < kPixels * 2; ++i) { EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(dst_pixels_u_c); free_aligned_buffer_page_end(dst_pixels_v_c); free_aligned_buffer_page_end(dst_pixels_u_opt); free_aligned_buffer_page_end(dst_pixels_v_opt); } TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { // Round count up to multiple of 16 const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(dst_pixels_opt, kPixels * 2); align_buffer_page_end(dst_pixels_c, kPixels * 2); MemRandomize(src_pixels, kPixels * 2); MemRandomize(dst_pixels_opt, kPixels * 2); MemRandomize(dst_pixels_c, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c, benchmark_width_ * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt, benchmark_width_ * 2, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 2; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { // Round count up to multiple of 16 const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); align_buffer_page_end(tmp_pixels_b, kPixels); 
align_buffer_page_end(dst_pixels_opt, kPixels * 3); align_buffer_page_end(dst_pixels_c, kPixels * 3); MemRandomize(src_pixels, kPixels * 3); MemRandomize(tmp_pixels_r, kPixels); MemRandomize(tmp_pixels_g, kPixels); MemRandomize(tmp_pixels_b, kPixels); MemRandomize(dst_pixels_opt, kPixels * 3); MemRandomize(dst_pixels_c, kPixels * 3); MaskCpuFlags(disable_cpu_flags_); SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, benchmark_width_, benchmark_height_); MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, dst_pixels_c, benchmark_width_ * 3, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, benchmark_width_, benchmark_height_); for (int i = 0; i < benchmark_iterations_; ++i) { MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, dst_pixels_opt, benchmark_width_ * 3, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 3; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(tmp_pixels_r); free_aligned_buffer_page_end(tmp_pixels_g); free_aligned_buffer_page_end(tmp_pixels_b); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { // Round count up to multiple of 16 const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); align_buffer_page_end(tmp_pixels_b, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 3); align_buffer_page_end(dst_pixels_c, kPixels * 3); MemRandomize(src_pixels, 
kPixels * 3); MemRandomize(tmp_pixels_r, kPixels); MemRandomize(tmp_pixels_g, kPixels); MemRandomize(tmp_pixels_b, kPixels); MemRandomize(dst_pixels_opt, kPixels * 3); MemRandomize(dst_pixels_c, kPixels * 3); MaskCpuFlags(disable_cpu_flags_); SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, benchmark_width_, benchmark_height_); MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, dst_pixels_c, benchmark_width_ * 3, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, benchmark_width_, benchmark_height_); } MergeRGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, dst_pixels_opt, benchmark_width_ * 3, benchmark_width_, benchmark_height_); for (int i = 0; i < kPixels * 3; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(tmp_pixels_r); free_aligned_buffer_page_end(tmp_pixels_g); free_aligned_buffer_page_end(tmp_pixels_b); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, MergeARGBPlane_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); align_buffer_page_end(tmp_pixels_b, kPixels); align_buffer_page_end(tmp_pixels_a, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); MemRandomize(src_pixels, kPixels * 4); MemRandomize(tmp_pixels_r, kPixels); MemRandomize(tmp_pixels_g, kPixels); MemRandomize(tmp_pixels_b, 
kPixels); MemRandomize(tmp_pixels_a, kPixels); MemRandomize(dst_pixels_opt, kPixels * 4); MemRandomize(dst_pixels_c, kPixels * 4); MaskCpuFlags(disable_cpu_flags_); SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, benchmark_width_, benchmark_height_); MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, benchmark_width_, benchmark_height_); for (int i = 0; i < benchmark_iterations_; ++i) { MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(tmp_pixels_r); free_aligned_buffer_page_end(tmp_pixels_g); free_aligned_buffer_page_end(tmp_pixels_b); free_aligned_buffer_page_end(tmp_pixels_a); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitARGBPlane_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); align_buffer_page_end(tmp_pixels_b, kPixels); align_buffer_page_end(tmp_pixels_a, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); MemRandomize(src_pixels, kPixels * 4); 
MemRandomize(tmp_pixels_r, kPixels); MemRandomize(tmp_pixels_g, kPixels); MemRandomize(tmp_pixels_b, kPixels); MemRandomize(tmp_pixels_a, kPixels); MemRandomize(dst_pixels_opt, kPixels * 4); MemRandomize(dst_pixels_c, kPixels * 4); MaskCpuFlags(disable_cpu_flags_); SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, benchmark_width_, benchmark_height_); MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, benchmark_width_, benchmark_height_); } MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, tmp_pixels_a, benchmark_width_, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(tmp_pixels_r); free_aligned_buffer_page_end(tmp_pixels_g); free_aligned_buffer_page_end(tmp_pixels_b); free_aligned_buffer_page_end(tmp_pixels_a); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); align_buffer_page_end(tmp_pixels_b, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, 
kPixels * 4); MemRandomize(src_pixels, kPixels * 4); MemRandomize(tmp_pixels_r, kPixels); MemRandomize(tmp_pixels_g, kPixels); MemRandomize(tmp_pixels_b, kPixels); MemRandomize(dst_pixels_opt, kPixels * 4); MemRandomize(dst_pixels_c, kPixels * 4); MaskCpuFlags(disable_cpu_flags_); SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_, benchmark_height_); MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_, benchmark_height_); for (int i = 0; i < benchmark_iterations_; ++i) { MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(tmp_pixels_r); free_aligned_buffer_page_end(tmp_pixels_g); free_aligned_buffer_page_end(tmp_pixels_b); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); align_buffer_page_end(tmp_pixels_b, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); MemRandomize(src_pixels, kPixels * 4); MemRandomize(tmp_pixels_r, kPixels); MemRandomize(tmp_pixels_g, 
kPixels); MemRandomize(tmp_pixels_b, kPixels); MemRandomize(dst_pixels_opt, kPixels * 4); MemRandomize(dst_pixels_c, kPixels * 4); MaskCpuFlags(disable_cpu_flags_); SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_, benchmark_height_); MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_c, benchmark_width_ * 4, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, benchmark_width_, benchmark_height_); } MergeARGBPlane(tmp_pixels_r, benchmark_width_, tmp_pixels_g, benchmark_width_, tmp_pixels_b, benchmark_width_, NULL, 0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_, benchmark_height_); for (int i = 0; i < kPixels * 4; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(src_pixels); free_aligned_buffer_page_end(tmp_pixels_r); free_aligned_buffer_page_end(tmp_pixels_g); free_aligned_buffer_page_end(tmp_pixels_b); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(dst_pixels_c); } // Merge 4 channels #define TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ const int kWidth = W1280; \ const int kPixels = kWidth * benchmark_height_; \ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ align_buffer_page_end(dst_memory_opt, kPixels 
* 4 * sizeof(DTYPE)); \ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_a, kPixels * sizeof(STYPE) + OFF); \ memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \ memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \ STYPE* src_pixels_r = reinterpret_cast(src_memory_r + OFF); \ STYPE* src_pixels_g = reinterpret_cast(src_memory_g + OFF); \ STYPE* src_pixels_b = reinterpret_cast(src_memory_b + OFF); \ STYPE* src_pixels_a = reinterpret_cast(src_memory_a + OFF); \ DTYPE* dst_pixels_c = reinterpret_cast(dst_memory_c); \ DTYPE* dst_pixels_opt = reinterpret_cast(dst_memory_opt); \ MaskCpuFlags(disable_cpu_flags_); \ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ kWidth, src_pixels_a, kWidth, dst_pixels_c, kWidth * 4, \ kWidth, NEG benchmark_height_, DEPTH); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ kWidth, src_pixels_a, kWidth, dst_pixels_opt, kWidth * 4, \ kWidth, NEG benchmark_height_, DEPTH); \ } \ for (int i = 0; i < kPixels * 4; ++i) { \ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ } \ free_aligned_buffer_page_end(src_memory_r); \ free_aligned_buffer_page_end(src_memory_g); \ free_aligned_buffer_page_end(src_memory_b); \ free_aligned_buffer_page_end(src_memory_a); \ free_aligned_buffer_page_end(dst_memory_c); \ free_aligned_buffer_page_end(dst_memory_opt); \ } // Merge 3 channel RGB into 4 channel XRGB with opaque alpha #define TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, FUNC##Plane_Opaque_##DEPTH##N) { \ const int kWidth = W1280; \ const int kPixels = kWidth * benchmark_height_; \ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + 
OFF); \ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ memset(dst_memory_c, 0, kPixels * 4 * sizeof(DTYPE)); \ memset(dst_memory_opt, 0, kPixels * 4 * sizeof(DTYPE)); \ STYPE* src_pixels_r = reinterpret_cast(src_memory_r + OFF); \ STYPE* src_pixels_g = reinterpret_cast(src_memory_g + OFF); \ STYPE* src_pixels_b = reinterpret_cast(src_memory_b + OFF); \ DTYPE* dst_pixels_c = reinterpret_cast(dst_memory_c); \ DTYPE* dst_pixels_opt = reinterpret_cast(dst_memory_opt); \ MaskCpuFlags(disable_cpu_flags_); \ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ kWidth, NULL, 0, dst_pixels_c, kWidth * 4, kWidth, \ NEG benchmark_height_, DEPTH); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ kWidth, NULL, 0, dst_pixels_opt, kWidth * 4, kWidth, \ NEG benchmark_height_, DEPTH); \ } \ for (int i = 0; i < kPixels * 4; ++i) { \ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ } \ free_aligned_buffer_page_end(src_memory_r); \ free_aligned_buffer_page_end(src_memory_g); \ free_aligned_buffer_page_end(src_memory_b); \ free_aligned_buffer_page_end(dst_memory_c); \ free_aligned_buffer_page_end(dst_memory_opt); \ } #define TESTQPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ 2) \ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ TESTQPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) \ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, 
DEPTH, benchmark_width_ + 1, _Any, +, \ 0) \ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ 2) \ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ TESTQPLANAROTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 10) TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 12) TESTQPLANARTOP(MergeAR64, uint16_t, uint16_t, 16) TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 10) TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 12) TESTQPLANARTOP(MergeARGB16To8, uint16_t, uint8_t, 16) #define TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, W1280, N, NEG, OFF) \ TEST_F(LibYUVPlanarTest, FUNC##Plane_##DEPTH##N) { \ const int kWidth = W1280; \ const int kPixels = kWidth * benchmark_height_; \ align_buffer_page_end(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ align_buffer_page_end(dst_memory_c, kPixels * 4 * sizeof(DTYPE)); \ align_buffer_page_end(dst_memory_opt, kPixels * 4 * sizeof(DTYPE)); \ MemRandomize(src_memory_r, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_g, kPixels * sizeof(STYPE) + OFF); \ MemRandomize(src_memory_b, kPixels * sizeof(STYPE) + OFF); \ STYPE* src_pixels_r = reinterpret_cast(src_memory_r + OFF); \ STYPE* src_pixels_g = reinterpret_cast(src_memory_g + OFF); \ STYPE* src_pixels_b = reinterpret_cast(src_memory_b + OFF); \ DTYPE* dst_pixels_c = reinterpret_cast(dst_memory_c); \ DTYPE* dst_pixels_opt = reinterpret_cast(dst_memory_opt); \ memset(dst_pixels_c, 1, kPixels * 4 * sizeof(DTYPE)); \ memset(dst_pixels_opt, 2, kPixels * 4 * sizeof(DTYPE)); \ MaskCpuFlags(disable_cpu_flags_); \ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ kWidth, dst_pixels_c, kWidth * 4, kWidth, \ NEG benchmark_height_, DEPTH); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < 
benchmark_iterations_; ++i) { \ FUNC##Plane(src_pixels_r, kWidth, src_pixels_g, kWidth, src_pixels_b, \ kWidth, dst_pixels_opt, kWidth * 4, kWidth, \ NEG benchmark_height_, DEPTH); \ } \ for (int i = 0; i < kPixels * 4; ++i) { \ EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); \ } \ free_aligned_buffer_page_end(src_memory_r); \ free_aligned_buffer_page_end(src_memory_g); \ free_aligned_buffer_page_end(src_memory_b); \ free_aligned_buffer_page_end(dst_memory_c); \ free_aligned_buffer_page_end(dst_memory_opt); \ } #define TESTTPLANARTOP(FUNC, STYPE, DTYPE, DEPTH) \ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_ + 1, _Any, +, 0) \ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Unaligned, +, \ 2) \ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Invert, -, 0) \ TESTTPLANARTOPI(FUNC, STYPE, DTYPE, DEPTH, benchmark_width_, _Opt, +, 0) TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 10) TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 12) TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16) // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2); align_buffer_page_end(dst_pixels_uv_c, kPixels * 2 * 2); MemRandomize(src_pixels_u, kPixels * 2); MemRandomize(src_pixels_v, kPixels * 2); memset(dst_pixels_uv_opt, 0, kPixels * 2 * 2); memset(dst_pixels_uv_c, 1, kPixels * 2 * 2); MergeUVRow_16_C(reinterpret_cast(src_pixels_u), reinterpret_cast(src_pixels_v), reinterpret_cast(dst_pixels_uv_c), 64, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { MergeUVRow_16_AVX2(reinterpret_cast(src_pixels_u), reinterpret_cast(src_pixels_v), 
reinterpret_cast(dst_pixels_uv_opt), 64, kPixels); } else { MergeUVRow_16_C(reinterpret_cast(src_pixels_u), reinterpret_cast(src_pixels_v), reinterpret_cast(dst_pixels_uv_opt), 64, kPixels); } } for (int i = 0; i < kPixels * 2 * 2; ++i) { EXPECT_EQ(dst_pixels_uv_opt[i], dst_pixels_uv_c[i]); } free_aligned_buffer_page_end(src_pixels_u); free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(dst_pixels_uv_opt); free_aligned_buffer_page_end(dst_pixels_uv_c); } #endif // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { // Round count up to multiple of 32 const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); MemRandomize(src_pixels_y, kPixels * 2); memset(dst_pixels_y_opt, 0, kPixels * 2); memset(dst_pixels_y_c, 1, kPixels * 2); MultiplyRow_16_C(reinterpret_cast(src_pixels_y), reinterpret_cast(dst_pixels_y_c), 64, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { MultiplyRow_16_AVX2(reinterpret_cast(src_pixels_y), reinterpret_cast(dst_pixels_y_opt), 64, kPixels); } else { MultiplyRow_16_C(reinterpret_cast(src_pixels_y), reinterpret_cast(dst_pixels_y_opt), 64, kPixels); } } for (int i = 0; i < kPixels * 2; ++i) { EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); } free_aligned_buffer_page_end(src_pixels_y); free_aligned_buffer_page_end(dst_pixels_y_opt); free_aligned_buffer_page_end(dst_pixels_y_c); } #endif // HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Plane) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels); align_buffer_page_end(dst_pixels_y_c, kPixels); MemRandomize(src_pixels_y, kPixels * 2); 
memset(dst_pixels_y_opt, 0, kPixels); memset(dst_pixels_y_c, 1, kPixels); MaskCpuFlags(disable_cpu_flags_); Convert16To8Plane(reinterpret_cast(src_pixels_y), benchmark_width_, dst_pixels_y_c, benchmark_width_, 16384, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { Convert16To8Plane(reinterpret_cast(src_pixels_y), benchmark_width_, dst_pixels_y_opt, benchmark_width_, 16384, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); } free_aligned_buffer_page_end(src_pixels_y); free_aligned_buffer_page_end(dst_pixels_y_opt); free_aligned_buffer_page_end(dst_pixels_y_c); } #ifdef ENABLE_ROW_TESTS // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_CONVERT16TO8ROW_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { // AVX2 does multiple of 32, so round count up const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels); align_buffer_page_end(dst_pixels_y_c, kPixels); MemRandomize(src_pixels_y, kPixels * 2); // clamp source range to 10 bits. 
for (int i = 0; i < kPixels; ++i) { reinterpret_cast(src_pixels_y)[i] &= 1023; } memset(dst_pixels_y_opt, 0, kPixels); memset(dst_pixels_y_c, 1, kPixels); Convert16To8Row_C(reinterpret_cast(src_pixels_y), dst_pixels_y_c, 16384, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { Convert16To8Row_AVX2(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); } else if (has_ssse3) { Convert16To8Row_SSSE3(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); } else { Convert16To8Row_C(reinterpret_cast(src_pixels_y), dst_pixels_y_opt, 16384, kPixels); } } for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); } free_aligned_buffer_page_end(src_pixels_y); free_aligned_buffer_page_end(dst_pixels_y_opt); free_aligned_buffer_page_end(dst_pixels_y_c); } #endif // HAS_CONVERT16TO8ROW_AVX2 #endif // ENABLE_ROW_TESTS TEST_F(LibYUVPlanarTest, Convert8To16Plane) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); MemRandomize(src_pixels_y, kPixels); memset(dst_pixels_y_opt, 0, kPixels * 2); memset(dst_pixels_y_c, 1, kPixels * 2); MaskCpuFlags(disable_cpu_flags_); Convert8To16Plane(src_pixels_y, benchmark_width_, reinterpret_cast(dst_pixels_y_c), benchmark_width_, 1024, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { Convert8To16Plane(src_pixels_y, benchmark_width_, reinterpret_cast(dst_pixels_y_opt), benchmark_width_, 1024, benchmark_width_, benchmark_height_); } for (int i = 0; i < kPixels * 2; ++i) { EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); } free_aligned_buffer_page_end(src_pixels_y); free_aligned_buffer_page_end(dst_pixels_y_opt); free_aligned_buffer_page_end(dst_pixels_y_c); 
} #ifdef ENABLE_ROW_TESTS // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_CONVERT8TO16ROW_AVX2 TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) { const int kPixels = (benchmark_width_ * benchmark_height_ + 31) & ~31; align_buffer_page_end(src_pixels_y, kPixels); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); MemRandomize(src_pixels_y, kPixels); memset(dst_pixels_y_opt, 0, kPixels * 2); memset(dst_pixels_y_c, 1, kPixels * 2); Convert8To16Row_C(src_pixels_y, reinterpret_cast(dst_pixels_y_c), 1024, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_sse2 = TestCpuFlag(kCpuHasSSE2); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { Convert8To16Row_AVX2(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, kPixels); } else if (has_sse2) { Convert8To16Row_SSE2(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, kPixels); } else { Convert8To16Row_C(src_pixels_y, reinterpret_cast(dst_pixels_y_opt), 1024, kPixels); } } for (int i = 0; i < kPixels * 2; ++i) { EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); } free_aligned_buffer_page_end(src_pixels_y); free_aligned_buffer_page_end(dst_pixels_y_opt); free_aligned_buffer_page_end(dst_pixels_y_c); } #endif // HAS_CONVERT8TO16ROW_AVX2 float TestScaleMaxSamples(int benchmark_width, int benchmark_height, int benchmark_iterations, float scale, bool opt) { int i, j; float max_c, max_opt = 0.f; // NEON does multiple of 8, so round count up const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; align_buffer_page_end(orig_y, kPixels * 4 * 3 + 48); uint8_t* dst_c = orig_y + kPixels * 4 + 16; uint8_t* dst_opt = orig_y + kPixels * 4 * 2 + 32; // Randomize works but may contain some denormals affecting performance. // MemRandomize(orig_y, kPixels * 4); // large values are problematic. audio is really -1 to 1. 
for (i = 0; i < kPixels; ++i) { (reinterpret_cast(orig_y))[i] = sinf(static_cast(i) * 0.1f); } memset(dst_c, 0, kPixels * 4); memset(dst_opt, 1, kPixels * 4); max_c = ScaleMaxSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_c), scale, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_SCALESUMSAMPLES_NEON max_opt = ScaleMaxSamples_NEON(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); #else max_opt = ScaleMaxSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); #endif } else { max_opt = ScaleMaxSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); } } float max_diff = FAbs(max_opt - max_c); for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - (reinterpret_cast(dst_opt)[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(orig_y); return max_diff; } TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_C) { float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_, benchmark_iterations_, 1.2f, false); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestScaleMaxSamples_Opt) { float diff = TestScaleMaxSamples(benchmark_width_, benchmark_height_, benchmark_iterations_, 1.2f, true); EXPECT_EQ(0, diff); } float TestScaleSumSamples(int benchmark_width, int benchmark_height, int benchmark_iterations, float scale, bool opt) { int i, j; float sum_c, sum_opt = 0.f; // NEON does multiple of 8, so round count up const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; align_buffer_page_end(orig_y, kPixels * 4 * 3); uint8_t* dst_c = orig_y + kPixels * 4; uint8_t* dst_opt = orig_y + kPixels * 4 * 2; // Randomize works but may contain some denormals affecting performance. // MemRandomize(orig_y, kPixels * 4); // large values are problematic. audio is really -1 to 1. 
for (i = 0; i < kPixels; ++i) { (reinterpret_cast(orig_y))[i] = sinf(static_cast(i) * 0.1f); } memset(dst_c, 0, kPixels * 4); memset(dst_opt, 1, kPixels * 4); sum_c = ScaleSumSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_c), scale, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_SCALESUMSAMPLES_NEON sum_opt = ScaleSumSamples_NEON(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); #else sum_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); #endif } else { sum_opt = ScaleSumSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); } } float mse_opt = sum_opt / kPixels * 4; float mse_c = sum_c / kPixels * 4; float mse_error = FAbs(mse_opt - mse_c) / mse_c; // If the sum of a float is more than 4 million, small adds are round down on // float and produce different results with vectorized sum vs scalar sum. // Ignore the difference if the sum is large. float max_diff = 0.f; if (mse_error > 0.0001 && sum_c < 4000000) { // allow .01% difference of mse max_diff = mse_error; } for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - (reinterpret_cast(dst_opt)[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(orig_y); return max_diff; } TEST_F(LibYUVPlanarTest, TestScaleSumSamples_C) { float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_, benchmark_iterations_, 1.2f, false); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestScaleSumSamples_Opt) { float diff = TestScaleSumSamples(benchmark_width_, benchmark_height_, benchmark_iterations_, 1.2f, true); EXPECT_EQ(0, diff); } float TestScaleSamples(int benchmark_width, int benchmark_height, int benchmark_iterations, float scale, bool opt) { int i, j; // NEON does multiple of 8, so round count up const int kPixels = (benchmark_width * benchmark_height + 7) & ~7; align_buffer_page_end(orig_y, kPixels * 4 * 3); uint8_t* dst_c = 
orig_y + kPixels * 4; uint8_t* dst_opt = orig_y + kPixels * 4 * 2; // Randomize works but may contain some denormals affecting performance. // MemRandomize(orig_y, kPixels * 4); // large values are problematic. audio is really -1 to 1. for (i = 0; i < kPixels; ++i) { (reinterpret_cast(orig_y))[i] = sinf(static_cast(i) * 0.1f); } memset(dst_c, 0, kPixels * 4); memset(dst_opt, 1, kPixels * 4); ScaleSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_c), scale, kPixels); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_SCALESUMSAMPLES_NEON ScaleSamples_NEON(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); #else ScaleSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); #endif } else { ScaleSamples_C(reinterpret_cast(orig_y), reinterpret_cast(dst_opt), scale, kPixels); } } float max_diff = 0.f; for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - (reinterpret_cast(dst_opt)[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(orig_y); return max_diff; } TEST_F(LibYUVPlanarTest, TestScaleSamples_C) { float diff = TestScaleSamples(benchmark_width_, benchmark_height_, benchmark_iterations_, 1.2f, false); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) { float diff = TestScaleSamples(benchmark_width_, benchmark_height_, benchmark_iterations_, 1.2f, true); EXPECT_EQ(0, diff); } float TestCopySamples(int benchmark_width, int benchmark_height, int benchmark_iterations, bool opt) { int i, j; // NEON does multiple of 16 floats, so round count up const int kPixels = (benchmark_width * benchmark_height + 15) & ~15; align_buffer_page_end(orig_y, kPixels * 4 * 3); uint8_t* dst_c = orig_y + kPixels * 4; uint8_t* dst_opt = orig_y + kPixels * 4 * 2; // Randomize works but may contain some denormals affecting performance. // MemRandomize(orig_y, kPixels * 4); // large values are problematic. audio is really -1 to 1. 
for (i = 0; i < kPixels; ++i) { (reinterpret_cast(orig_y))[i] = sinf(static_cast(i) * 0.1f); } memset(dst_c, 0, kPixels * 4); memset(dst_opt, 1, kPixels * 4); memcpy(reinterpret_cast(dst_c), reinterpret_cast(orig_y), kPixels * 4); for (j = 0; j < benchmark_iterations; j++) { if (opt) { #ifdef HAS_COPYROW_NEON CopyRow_NEON(orig_y, dst_opt, kPixels * 4); #else CopyRow_C(orig_y, dst_opt, kPixels * 4); #endif } else { CopyRow_C(orig_y, dst_opt, kPixels * 4); } } float max_diff = 0.f; for (i = 0; i < kPixels; ++i) { float abs_diff = FAbs((reinterpret_cast(dst_c)[i]) - (reinterpret_cast(dst_opt)[i])); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(orig_y); return max_diff; } TEST_F(LibYUVPlanarTest, TestCopySamples_C) { float diff = TestCopySamples(benchmark_width_, benchmark_height_, benchmark_iterations_, false); EXPECT_EQ(0, diff); } TEST_F(LibYUVPlanarTest, TestCopySamples_Opt) { float diff = TestCopySamples(benchmark_width_, benchmark_height_, benchmark_iterations_, true); EXPECT_EQ(0, diff); } extern "C" void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width); extern "C" void GaussRow_C(const uint32_t* src, uint16_t* dst, int width); TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { SIMD_ALIGNED(uint32_t orig_pixels[1280 + 8]); SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); for (int i = 0; i < 1280 + 8; ++i) { orig_pixels[i] = i * 256; } GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); } else { GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); } #else 
GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); #endif } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } EXPECT_EQ(dst_pixels_c[0], static_cast(0 * 1 + 1 * 4 + 2 * 6 + 3 * 4 + 4 * 1)); EXPECT_EQ(dst_pixels_c[639], static_cast(10256)); } extern "C" void GaussCol_NEON(const uint16_t* src0, const uint16_t* src1, const uint16_t* src2, const uint16_t* src3, const uint16_t* src4, uint32_t* dst, int width); extern "C" void GaussCol_C(const uint16_t* src0, const uint16_t* src1, const uint16_t* src2, const uint16_t* src3, const uint16_t* src4, uint32_t* dst, int width); TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { SIMD_ALIGNED(uint16_t orig_pixels[1280 * 5]); SIMD_ALIGNED(uint32_t dst_pixels_c[1280]); SIMD_ALIGNED(uint32_t dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); for (int i = 0; i < 1280 * 5; ++i) { orig_pixels[i] = static_cast(i); } GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0], 1280); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { GaussCol_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); } else { GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); } #else GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); #endif } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } } TEST_F(LibYUVPlanarTest, TestGaussRow_F32_Opt) { SIMD_ALIGNED(float 
orig_pixels[1280 + 4]); SIMD_ALIGNED(float dst_pixels_c[1280]); SIMD_ALIGNED(float dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); for (int i = 0; i < 1280 + 4; ++i) { orig_pixels[i] = static_cast(i); } GaussRow_F32_C(&orig_pixels[0], &dst_pixels_c[0], 1280); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { GaussRow_F32_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280); } else { GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); } #else GaussRow_F32_C(&orig_pixels[0], &dst_pixels_opt[0], 1280); #endif } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } } TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) { SIMD_ALIGNED(float dst_pixels_c[1280]); SIMD_ALIGNED(float dst_pixels_opt[1280]); align_buffer_page_end(orig_pixels_buf, 1280 * 5 * 4); // 5 rows float* orig_pixels = reinterpret_cast(orig_pixels_buf); memset(orig_pixels, 0, 1280 * 5 * 4); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); for (int i = 0; i < 1280 * 5; ++i) { orig_pixels[i] = static_cast(i); } GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_c[0], 1280); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); } else { GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); } #else GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], 
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280); #endif } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } free_aligned_buffer_page_end(orig_pixels_buf); } TEST_F(LibYUVPlanarTest, SwapUVRow) { const int kPixels = benchmark_width_ * benchmark_height_; void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = SwapUVRow_C; align_buffer_page_end(src_pixels_vu, kPixels * 2); align_buffer_page_end(dst_pixels_uv, kPixels * 2); MemRandomize(src_pixels_vu, kPixels * 2); memset(dst_pixels_uv, 1, kPixels * 2); #if defined(HAS_SWAPUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SwapUVRow = SwapUVRow_Any_NEON; if (IS_ALIGNED(kPixels, 16)) { SwapUVRow = SwapUVRow_NEON; } } #endif for (int j = 0; j < benchmark_iterations_; j++) { SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels); } for (int i = 0; i < kPixels; ++i) { EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]); EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]); } free_aligned_buffer_page_end(src_pixels_vu); free_aligned_buffer_page_end(dst_pixels_uv); } #endif // ENABLE_ROW_TESTS TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) { const int kSize = benchmark_width_ * benchmark_height_ * 4; align_buffer_page_end(orig_pixels, kSize); align_buffer_page_end(dst_pixels_opt, kSize); align_buffer_page_end(dst_pixels_c, kSize); for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) { ((float*)(orig_pixels))[i] = (i & 1023) * 3.14f; } memset(dst_pixels_opt, 1, kSize); memset(dst_pixels_c, 2, kSize); MaskCpuFlags(disable_cpu_flags_); GaussPlane_F32((const float*)(orig_pixels), benchmark_width_, (float*)(dst_pixels_c), benchmark_width_, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { GaussPlane_F32((const float*)(orig_pixels), benchmark_width_, (float*)(dst_pixels_opt), benchmark_width_, benchmark_width_, benchmark_height_); } for (int i = 0; i 
< benchmark_width_ * benchmark_height_; ++i) { EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f) << i; } free_aligned_buffer_page_end(dst_pixels_c); free_aligned_buffer_page_end(dst_pixels_opt); free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) { int dst_width = (benchmark_width_ + 1) / 2; int dst_height = (benchmark_height_ + 1) / 2; align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_); align_buffer_page_end(src_pixels_v, benchmark_width_ * benchmark_height_); align_buffer_page_end(tmp_pixels_u, dst_width * dst_height); align_buffer_page_end(tmp_pixels_v, dst_width * dst_height); align_buffer_page_end(dst_pixels_uv_opt, dst_width * 2 * dst_height); align_buffer_page_end(dst_pixels_uv_c, dst_width * 2 * dst_height); MemRandomize(src_pixels_u, benchmark_width_ * benchmark_height_); MemRandomize(src_pixels_v, benchmark_width_ * benchmark_height_); MemRandomize(tmp_pixels_u, dst_width * dst_height); MemRandomize(tmp_pixels_v, dst_width * dst_height); MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height); MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height); MaskCpuFlags(disable_cpu_flags_); HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_uv_c, dst_width * 2, benchmark_width_, benchmark_height_); MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, benchmark_width_, dst_pixels_uv_opt, dst_width * 2, benchmark_width_, benchmark_height_); } for (int i = 0; i < dst_width * 2 * dst_height; ++i) { EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]); } free_aligned_buffer_page_end(src_pixels_u); free_aligned_buffer_page_end(src_pixels_v); free_aligned_buffer_page_end(tmp_pixels_u); free_aligned_buffer_page_end(tmp_pixels_v); free_aligned_buffer_page_end(dst_pixels_uv_opt); free_aligned_buffer_page_end(dst_pixels_uv_c); } 
// Fills source and destination Y and UV buffers with random bytes, calls
// NV12Copy benchmark_iterations_ times, and then requires every byte of the
// destination Y and UV buffers to equal the corresponding source byte.
TEST_F(LibYUVPlanarTest, NV12Copy) {
  // Chroma dimensions: half the luma size, rounded up.
  const int kHalfWidth = (benchmark_width_ + 1) >> 1;
  const int kHalfHeight = (benchmark_height_ + 1) >> 1;
  const int kYSize = benchmark_width_ * benchmark_height_;
  // The UV buffer is sized and strided at two bytes per chroma column.
  const int kUVStride = kHalfWidth * 2;
  const int kUVSize = kUVStride * kHalfHeight;

  align_buffer_page_end(src_y, kYSize);
  align_buffer_page_end(src_uv, kUVSize);
  align_buffer_page_end(dst_y, kYSize);
  align_buffer_page_end(dst_uv, kUVSize);

  MemRandomize(src_y, kYSize);
  MemRandomize(src_uv, kUVSize);
  MemRandomize(dst_y, kYSize);
  MemRandomize(dst_uv, kUVSize);

  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    NV12Copy(src_y, benchmark_width_, src_uv, kUVStride, dst_y,
             benchmark_width_, dst_uv, kUVStride, benchmark_width_,
             benchmark_height_);
  }

  // Luma plane must be an exact copy.
  for (int n = 0; n < kYSize; ++n) {
    EXPECT_EQ(src_y[n], dst_y[n]);
  }
  // Chroma plane must be an exact copy.
  for (int n = 0; n < kUVSize; ++n) {
    EXPECT_EQ(src_uv[n], dst_uv[n]);
  }

  free_aligned_buffer_page_end(src_y);
  free_aligned_buffer_page_end(src_uv);
  free_aligned_buffer_page_end(dst_y);
  free_aligned_buffer_page_end(dst_uv);
}

TEST_F(LibYUVPlanarTest, NV21Copy) {
  const int halfwidth = (benchmark_width_ + 1) >> 1;
  const int halfheight = (benchmark_height_ + 1) >> 1;
  align_buffer_page_end(src_y, benchmark_width_ * benchmark_height_);
  align_buffer_page_end(src_vu, halfwidth * 2 * halfheight);
  align_buffer_page_end(dst_y, benchmark_width_ * benchmark_height_);
  align_buffer_page_end(dst_vu, halfwidth * 2 * halfheight);
  MemRandomize(src_y, benchmark_width_ * benchmark_height_);
  MemRandomize(src_vu, halfwidth * 2 * halfheight);
  MemRandomize(dst_y, benchmark_width_ * benchmark_height_);
  MemRandomize(dst_vu, halfwidth * 2 * halfheight);
  for (int i = 0; i < benchmark_iterations_; ++i) {
    NV21Copy(src_y, benchmark_width_, src_vu, halfwidth * 2, dst_y,
             benchmark_width_, dst_vu, halfwidth * 2, benchmark_width_,
             benchmark_height_);
  }
  for (int i = 0; i < benchmark_width_ *
benchmark_height_; ++i) { EXPECT_EQ(src_y[i], dst_y[i]); } for (int i = 0; i < halfwidth * 2 * halfheight; ++i) { EXPECT_EQ(src_vu[i], dst_vu[i]); } free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_vu); free_aligned_buffer_page_end(dst_y); free_aligned_buffer_page_end(dst_vu); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/rotate_argb_test.cc000066400000000000000000000215671416500237200234160ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/rotate_argb.h" namespace libyuv { void TestRotateBpp(int src_width, int src_height, int dst_width, int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info, const int kBpp) { if (src_width < 1) { src_width = 1; } if (src_height < 1) { src_height = 1; } if (dst_width < 1) { dst_width = 1; } if (dst_height < 1) { dst_height = 1; } int src_stride_argb = src_width * kBpp; int src_argb_plane_size = src_stride_argb * abs(src_height); align_buffer_page_end(src_argb, src_argb_plane_size); for (int i = 0; i < src_argb_plane_size; ++i) { src_argb[i] = fastrand() & 0xff; } int dst_stride_argb = dst_width * kBpp; int dst_argb_plane_size = dst_stride_argb * dst_height; align_buffer_page_end(dst_argb_c, dst_argb_plane_size); align_buffer_page_end(dst_argb_opt, dst_argb_plane_size); memset(dst_argb_c, 2, dst_argb_plane_size); memset(dst_argb_opt, 3, dst_argb_plane_size); if (kBpp == 1) { MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
RotatePlane(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { RotatePlane(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb, src_width, src_height, mode); } } else if (kBpp == 4) { MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. ARGBRotate(src_argb, src_stride_argb, dst_argb_c, dst_stride_argb, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { ARGBRotate(src_argb, src_stride_argb, dst_argb_opt, dst_stride_argb, src_width, src_height, mode); } } // Rotation should be exact. for (int i = 0; i < dst_argb_plane_size; ++i) { EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); } free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); free_aligned_buffer_page_end(src_argb); } static void ARGBTestRotate(int src_width, int src_height, int dst_width, int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { TestRotateBpp(src_width, src_height, dst_width, dst_height, mode, benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 4); } TEST_F(LibYUVRotateTest, ARGBRotate0_Opt) { ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, ARGBRotate90_Opt) { ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, ARGBRotate180_Opt) { ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, ARGBRotate270_Opt) { 
ARGBTestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } static void TestRotatePlane(int src_width, int src_height, int dst_width, int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { TestRotateBpp(src_width, src_height, dst_width, dst_height, mode, benchmark_iterations, disable_cpu_flags, benchmark_cpu_info, 1); } TEST_F(LibYUVRotateTest, RotatePlane0_Opt) { TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane90_Opt) { TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane180_Opt) { TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane270_Opt) { TestRotatePlane(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane0_Odd) { TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane90_Odd) { TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane180_Odd) { TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, 
kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) { TestRotatePlane(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) { int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_); align_buffer_page_end(src_argb, argb_plane_size); align_buffer_page_end(dst_argb, argb_plane_size); EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, benchmark_width_ * 4, benchmark_width_, benchmark_height_, kRotate0)); EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, benchmark_width_ * 4 - 1, benchmark_width_ - 1, benchmark_height_, kRotate0)); EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, benchmark_width_ * 4, benchmark_width_, benchmark_height_, kRotate180)); EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, benchmark_width_ * 4 - 1, benchmark_width_ - 1, benchmark_height_, kRotate180)); EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, abs(benchmark_height_) * 4, benchmark_width_, benchmark_height_, kRotate90)); EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, abs(benchmark_height_) * 4, benchmark_width_ - 1, benchmark_height_, kRotate90)); EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb, abs(benchmark_height_) * 4, benchmark_width_, benchmark_height_, kRotate270)); EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb, abs(benchmark_height_) * 4, benchmark_width_ - 1, benchmark_height_, kRotate270)); free_aligned_buffer_page_end(dst_argb); free_aligned_buffer_page_end(src_argb); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/rotate_test.cc000066400000000000000000000576271416500237200224310ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project 
Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/rotate.h" namespace libyuv { #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) static void I420TestRotate(int src_width, int src_height, int dst_width, int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (src_width < 1) { src_width = 1; } if (src_height == 0) { src_height = 1; } if (dst_width < 1) { dst_width = 1; } if (dst_height < 1) { dst_height = 1; } int src_i420_y_size = src_width * Abs(src_height); int src_i420_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2); int src_i420_size = src_i420_y_size + src_i420_uv_size * 2; align_buffer_page_end(src_i420, src_i420_size); for (int i = 0; i < src_i420_size; ++i) { src_i420[i] = fastrand() & 0xff; } int dst_i420_y_size = dst_width * dst_height; int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2; align_buffer_page_end(dst_i420_c, dst_i420_size); align_buffer_page_end(dst_i420_opt, dst_i420_size); memset(dst_i420_c, 2, dst_i420_size); memset(dst_i420_opt, 3, dst_i420_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. I420Rotate(src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2, dst_i420_c, dst_width, dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_c + dst_i420_y_size + dst_i420_uv_size, (dst_width + 1) / 2, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
for (int i = 0; i < benchmark_iterations; ++i) { I420Rotate( src_i420, src_width, src_i420 + src_i420_y_size, (src_width + 1) / 2, src_i420 + src_i420_y_size + src_i420_uv_size, (src_width + 1) / 2, dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size, (dst_width + 1) / 2, src_width, src_height, mode); } // Rotation should be exact. for (int i = 0; i < dst_i420_size; ++i) { EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]); } free_aligned_buffer_page_end(dst_i420_c); free_aligned_buffer_page_end(dst_i420_opt); free_aligned_buffer_page_end(src_i420); } TEST_F(LibYUVRotateTest, I420Rotate0_Opt) { I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I420Rotate90_Opt) { I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I420Rotate180_Opt) { I420TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I420Rotate270_Opt) { I420TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } // TODO(fbarchard): Remove odd width tests. // Odd width tests work but disabled because they use C code and can be // tested by passing an odd width command line or environment variable. 
TEST_F(LibYUVRotateTest, DISABLED_I420Rotate0_Odd) { I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate90_Odd) { I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate180_Odd) { I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) { I420TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } static void I444TestRotate(int src_width, int src_height, int dst_width, int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (src_width < 1) { src_width = 1; } if (src_height == 0) { src_height = 1; } if (dst_width < 1) { dst_width = 1; } if (dst_height < 1) { dst_height = 1; } int src_i444_y_size = src_width * Abs(src_height); int src_i444_uv_size = src_width * Abs(src_height); int src_i444_size = src_i444_y_size + src_i444_uv_size * 2; align_buffer_page_end(src_i444, src_i444_size); for (int i = 0; i < src_i444_size; ++i) { src_i444[i] = fastrand() & 0xff; } int dst_i444_y_size = dst_width * dst_height; int dst_i444_uv_size = dst_width * dst_height; int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2; align_buffer_page_end(dst_i444_c, dst_i444_size); align_buffer_page_end(dst_i444_opt, dst_i444_size); memset(dst_i444_c, 2, dst_i444_size); memset(dst_i444_opt, 3, dst_i444_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width, src_i444 + src_i444_y_size + src_i444_uv_size, src_width, dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width, dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width, src_i444 + src_i444_y_size + src_i444_uv_size, src_width, dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size, dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size, dst_width, src_width, src_height, mode); } // Rotation should be exact. for (int i = 0; i < dst_i444_size; ++i) { EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]); } free_aligned_buffer_page_end(dst_i444_c); free_aligned_buffer_page_end(dst_i444_opt); free_aligned_buffer_page_end(src_i444); } TEST_F(LibYUVRotateTest, I444Rotate0_Opt) { I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I444Rotate90_Opt) { I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I444Rotate180_Opt) { I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, I444Rotate270_Opt) { I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } // TODO(fbarchard): Remove odd width tests. // Odd width tests work but disabled because they use C code and can be // tested by passing an odd width command line or environment variable. 
TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) { I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) { I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) { I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) { I444TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } static void NV12TestRotate(int src_width, int src_height, int dst_width, int dst_height, libyuv::RotationMode mode, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (src_width < 1) { src_width = 1; } if (src_height == 0) { // allow negative for inversion test. 
src_height = 1; } if (dst_width < 1) { dst_width = 1; } if (dst_height < 1) { dst_height = 1; } int src_nv12_y_size = src_width * Abs(src_height); int src_nv12_uv_size = ((src_width + 1) / 2) * ((Abs(src_height) + 1) / 2) * 2; int src_nv12_size = src_nv12_y_size + src_nv12_uv_size; align_buffer_page_end(src_nv12, src_nv12_size); for (int i = 0; i < src_nv12_size; ++i) { src_nv12[i] = fastrand() & 0xff; } int dst_i420_y_size = dst_width * dst_height; int dst_i420_uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2); int dst_i420_size = dst_i420_y_size + dst_i420_uv_size * 2; align_buffer_page_end(dst_i420_c, dst_i420_size); align_buffer_page_end(dst_i420_opt, dst_i420_size); memset(dst_i420_c, 2, dst_i420_size); memset(dst_i420_opt, 3, dst_i420_size); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size, (src_width + 1) & ~1, dst_i420_c, dst_width, dst_i420_c + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_c + dst_i420_y_size + dst_i420_uv_size, (dst_width + 1) / 2, src_width, src_height, mode); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations; ++i) { NV12ToI420Rotate(src_nv12, src_width, src_nv12 + src_nv12_y_size, (src_width + 1) & ~1, dst_i420_opt, dst_width, dst_i420_opt + dst_i420_y_size, (dst_width + 1) / 2, dst_i420_opt + dst_i420_y_size + dst_i420_uv_size, (dst_width + 1) / 2, src_width, src_height, mode); } // Rotation should be exact. 
for (int i = 0; i < dst_i420_size; ++i) { EXPECT_EQ(dst_i420_c[i], dst_i420_opt[i]); } free_aligned_buffer_page_end(dst_i420_c); free_aligned_buffer_page_end(dst_i420_opt); free_aligned_buffer_page_end(src_nv12); } TEST_F(LibYUVRotateTest, NV12Rotate0_Opt) { NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate90_Opt) { NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate180_Opt) { NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate270_Opt) { NV12TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate0_Odd) { NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate90_Odd) { NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate180_Odd) { NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, benchmark_height_ + 1, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, DISABLED_NV12Rotate270_Odd) { NV12TestRotate(benchmark_width_ + 1, benchmark_height_ + 1, benchmark_height_ + 1, benchmark_width_ + 1, kRotate270, benchmark_iterations_, disable_cpu_flags_, 
benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate0_Invert) { NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_, benchmark_height_, kRotate0, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate90_Invert) { NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_, benchmark_width_, kRotate90, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate180_Invert) { NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_width_, benchmark_height_, kRotate180, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) { NV12TestRotate(benchmark_width_, -benchmark_height_, benchmark_height_, benchmark_width_, kRotate270, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); } // Test Android 420 to I420 Rotate #define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF, PN, OFF_U, OFF_V, ROT) \ TEST_F(LibYUVRotateTest, \ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate##ROT##To##PN##N) { \ const int kWidth = W1280; \ const int kHeight = benchmark_height_; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ align_buffer_page_end(src_uv, \ kSizeUV*((PIXEL_STRIDE == 3) ? 
3 : 2) + OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ uint8_t* src_u = src_uv + OFF_U; \ uint8_t* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \ int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ src_u[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ (fastrand() & 0xff); \ src_v[(i * src_stride_uv) + j * PIXEL_STRIDE + OFF] = \ (fastrand() & 0xff); \ } \ } \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_u_c, 2, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_c, 3, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_u_opt, 102, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_opt, 103, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, dst_y_c, \ kWidth, dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \ SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight, \ (libyuv::RotationMode)ROT); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ 
SRC_FMT_PLANAR##To##FMT_PLANAR##Rotate( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), PIXEL_STRIDE, \ dst_y_opt, kWidth, dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), \ dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight, \ (libyuv::RotationMode)ROT); \ } \ for (int i = 0; i < kHeight; ++i) { \ for (int j = 0; j < kWidth; ++j) { \ EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ EXPECT_EQ(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ EXPECT_EQ(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \ dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \ } \ } \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_u_c); \ free_aligned_buffer_page_end(dst_v_c); \ free_aligned_buffer_page_end(dst_y_opt); \ free_aligned_buffer_page_end(dst_u_opt); \ free_aligned_buffer_page_end(dst_v_opt); \ free_aligned_buffer_page_end(src_y); \ free_aligned_buffer_page_end(src_uv); \ } #define TESTAPLANARTOP(SRC_FMT_PLANAR, PN, PIXEL_STRIDE, OFF_U, OFF_V, \ SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_ + 1, \ _Any, +, 0, PN, OFF_U, OFF_V, 0) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, \ _Unaligned, +, 2, PN, OFF_U, OFF_V, 0) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, \ -, 0, PN, OFF_U, OFF_V, 0) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, 
SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ 0, PN, OFF_U, OFF_V, 0) \ TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, \ 0, PN, OFF_U, OFF_V, 180) TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2) TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2) TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2) #undef TESTAPLANARTOP #undef TESTAPLANARTOPI } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/scale_argb_test.cc000066400000000000000000000553571416500237200232130ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include "../unit_test/unit_test.h" #include "libyuv/convert_argb.h" #include "libyuv/cpu_id.h" #include "libyuv/scale_argb.h" #include "libyuv/video_common.h" namespace libyuv { #define STRINGIZE(line) #line #define FILELINESTR(file, line) file ":" STRINGIZE(line) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) // SLOW TESTS are those that are unoptimized C code. // FULL TESTS are optimized but test many variations of the same code. #define ENABLE_FULL_TESTS #endif // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int ARGBTestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i, j; const int b = 0; // 128 to test for padding/stride. 
int64_t src_argb_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4LL; int src_stride_argb = (b * 2 + Abs(src_width)) * 4; align_buffer_page_end(src_argb, src_argb_plane_size); if (!src_argb) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MemRandomize(src_argb, src_argb_plane_size); int64_t dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4LL; int dst_stride_argb = (b * 2 + dst_width) * 4; align_buffer_page_end(dst_argb_c, dst_argb_plane_size); align_buffer_page_end(dst_argb_opt, dst_argb_plane_size); if (!dst_argb_c || !dst_argb_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } memset(dst_argb_c, 2, dst_argb_plane_size); memset(dst_argb_opt, 3, dst_argb_plane_size); // Warm up both versions for consistent benchmarks. MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, dst_width, dst_height, f); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. 
double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference isn't // over 2. int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 4; j < (dst_width + b) * 4; ++j) { int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); free_aligned_buffer_page_end(src_argb); return max_diff; } static const int kTileX = 64; static const int kTileY = 64; static int TileARGBScale(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, int dst_width, int dst_height, FilterMode filtering) { for (int y = 0; y < dst_height; y += kTileY) { for (int x = 0; x < dst_width; x += kTileX) { int clip_width = kTileX; if (x + clip_width > dst_width) { clip_width = dst_width - x; } int clip_height = kTileY; if (y + clip_height > dst_height) { clip_height = dst_height - y; } int r = ARGBScaleClip(src_argb, src_stride_argb, src_width, src_height, dst_argb, dst_stride_argb, dst_width, dst_height, x, y, clip_width, clip_height, filtering); if (r) { return r; } } } return 0; } static int ARGBClipTestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) 
{ return 0; } const int b = 128; int64_t src_argb_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 4; int src_stride_argb = (b * 2 + Abs(src_width)) * 4; align_buffer_page_end(src_argb, src_argb_plane_size); if (!src_argb) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } memset(src_argb, 1, src_argb_plane_size); int64_t dst_argb_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 4; int dst_stride_argb = (b * 2 + dst_width) * 4; int i, j; for (i = b; i < (Abs(src_height) + b); ++i) { for (j = b; j < (Abs(src_width) + b) * 4; ++j) { src_argb[(i * src_stride_argb) + j] = (fastrand() & 0xff); } } align_buffer_page_end(dst_argb_c, dst_argb_plane_size); align_buffer_page_end(dst_argb_opt, dst_argb_plane_size); if (!dst_argb_c || !dst_argb_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } memset(dst_argb_c, 2, dst_argb_plane_size); memset(dst_argb_opt, 3, dst_argb_plane_size); // Do full image, no clipping. double c_time = get_time(); ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, src_width, src_height, dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb, dst_width, dst_height, f); c_time = (get_time() - c_time); // Do tiled image, clipping scale to a tile at a time. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { TileARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb, src_width, src_height, dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of Full vs Tiled. printf("filter %d - %8d us Full - %8d us Tiled\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); // Compare full scaled image vs tiled image. 
int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 4; j < (dst_width + b) * 4; ++j) { int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); free_aligned_buffer_page_end(src_argb); return max_diff; } // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. #define DX(x, nom, denom) static_cast((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast((x / nom) * denom) #define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, ARGBScaleDownBy##name##_##filter) { \ int diff = ARGBTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, DISABLED_##ARGBScaleDownClipBy##name##_##filter) { \ int diff = ARGBClipTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_); \ EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. 
// TEST_FACTOR(name, nom, denom) expands to one TEST_FACTOR1 per filter. The
// last argument is the largest per-byte difference the test accepts: 0 for
// None, 3 for the filtered modes. The first argument is empty when slow tests
// are enabled and DISABLED_ otherwise.
#ifndef DISABLE_SLOW_TESTS
#define TEST_FACTOR(name, nom, denom) \
  TEST_FACTOR1(, name, None, nom, denom, 0) \
  TEST_FACTOR1(, name, Linear, nom, denom, 3) \
  TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
  TEST_FACTOR1(, name, Box, nom, denom, 3)
#else
#if defined(ENABLE_FULL_TESTS)
#define TEST_FACTOR(name, nom, denom) \
  TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
  TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
  TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
  TEST_FACTOR1(DISABLED_, name, Box, nom, denom, 3)
#else
#define TEST_FACTOR(name, nom, denom) \
  TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3)
#endif
#endif

/* Scale factors under test, written as (name, nom, denom). */
TEST_FACTOR(2, 1, 2)
TEST_FACTOR(4, 1, 4)
#ifndef DISABLE_SLOW_TESTS
TEST_FACTOR(8, 1, 8)
#endif
TEST_FACTOR(3by4, 3, 4)
TEST_FACTOR(3by8, 3, 8)
TEST_FACTOR(3, 1, 3)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX

// Generates four tests for one fixed size and filter:
//   name##To...       benchmark size -> width x height (C vs optimized)
//   name##From...     width x height -> |benchmark size| (C vs optimized)
//   name##ClipTo...   same as To, full vs tiled scaling
//   name##ClipFrom... same as From, full vs tiled scaling
// Only the two Clip test names are prefixed with the DISABLED_ argument.
#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
  TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \
    int diff = ARGBTestFilter(benchmark_width_, benchmark_height_, width, \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \
    int diff = ARGBTestFilter(width, height, Abs(benchmark_width_), \
                              Abs(benchmark_height_), kFilter##filter, \
                              benchmark_iterations_, disable_cpu_flags_, \
                              benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##name##ClipTo##width##x##height##_##filter) { \
    int diff = \
        ARGBClipTestFilter(benchmark_width_, benchmark_height_, width, height, \
                           kFilter##filter, benchmark_iterations_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##name##ClipFrom##width##x##height##_##filter) { \
    int diff = ARGBClipTestFilter(width, height, Abs(benchmark_width_), \
                                  Abs(benchmark_height_), kFilter##filter, \
                                  benchmark_iterations_); \
    EXPECT_LE(diff, max_diff); \
  }
// TEST_SCALETO(name, width, height) instantiates TEST_SCALETO1 once per filter
// mode, with a tolerated difference of 0 for None and 3 for Linear/Bilinear.
//  - Default build: all three filters, with an empty first argument.
//  - DISABLE_SLOW_TESTS with ENABLE_FULL_TESTS: all three filters, with
//    DISABLED_ as the first argument.
//  - DISABLE_SLOW_TESTS without ENABLE_FULL_TESTS: Bilinear only, with
//    DISABLED_ as the first argument.
#ifndef DISABLE_SLOW_TESTS
// Test scale to a specified size with all 3 filters.
#define TEST_SCALETO(name, width, height)         \
  TEST_SCALETO1(, name, width, height, None, 0)   \
  TEST_SCALETO1(, name, width, height, Linear, 3) \
  TEST_SCALETO1(, name, width, height, Bilinear, 3)
#else
#if defined(ENABLE_FULL_TESTS)
#define TEST_SCALETO(name, width, height)                  \
  TEST_SCALETO1(DISABLED_, name, width, height, None, 0)   \
  TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
  TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3)
#else
#define TEST_SCALETO(name, width, height) \
  TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3)
#endif
#endif

// Target sizes under test. The two largest sizes are only instantiated when
// slow tests are enabled.
TEST_SCALETO(ARGBScale, 1, 1)
// TEST_SCALETO(ARGBScale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(ARGBScale, 320, 240)
TEST_SCALETO(ARGBScale, 569, 480)
TEST_SCALETO(ARGBScale, 640, 360)
#ifndef DISABLE_SLOW_TESTS
TEST_SCALETO(ARGBScale, 1280, 720)
TEST_SCALETO(ARGBScale, 1920, 1080)
#endif  // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO

// TEST_SCALESWAPXY1 defines <name>SwapXY_<filter>: it scales the benchmark
// size to the same dimensions with width and height exchanged, through
// ARGBTestFilter, and expects the returned difference to be <= max_diff.
#define TEST_SCALESWAPXY1(name, filter, max_diff)                       \
  TEST_F(LibYUVScaleTest, name##SwapXY_##filter) {                      \
    int diff = ARGBTestFilter(benchmark_width_, benchmark_height_,      \
                              benchmark_height_, benchmark_width_,      \
                              kFilter##filter, benchmark_iterations_,   \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff);                                          \
  }

#if defined(ENABLE_FULL_TESTS)
// Test scale with swapped width and height with all 3 filters.
TEST_SCALESWAPXY1(ARGBScale, None, 0)
TEST_SCALESWAPXY1(ARGBScale, Linear, 0)
TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
#else
// Reduced build: Bilinear only.
TEST_SCALESWAPXY1(ARGBScale, Bilinear, 0)
#endif
#undef TEST_SCALESWAPXY1

// Scale with YUV conversion to ARGB and clipping.
// TODO(fbarchard): Add fourcc support. All 4 ARGB formats is easy to support.
LIBYUV_API int YUVToARGBScaleReference2(const uint8_t* src_y, int src_stride_y, const uint8_t* src_u, int src_stride_u, const uint8_t* src_v, int src_stride_v, uint32_t /* src_fourcc */, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, uint32_t /* dst_fourcc */, int dst_width, int dst_height, int clip_x, int clip_y, int clip_width, int clip_height, enum FilterMode filtering) { uint8_t* argb_buffer = static_cast(malloc(src_width * src_height * 4)); int r; I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, argb_buffer, src_width * 4, src_width, src_height); r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, clip_height, filtering); free(argb_buffer); return r; } static void FillRamp(uint8_t* buf, int width, int height, int v, int dx, int dy) { int rv = v; for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { *buf++ = v; v += dx; if (v < 0 || v > 255) { dx = -dx; v += dx; } } v = rv + dy; if (v < 0 || v > 255) { dy = -dy; v += dy; } rv = v; } } // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int YUVToARGBTestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations) { int64_t src_y_plane_size = Abs(src_width) * Abs(src_height); int64_t src_uv_plane_size = ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2); int src_stride_y = Abs(src_width); int src_stride_uv = (Abs(src_width) + 1) / 2; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); int64_t dst_argb_plane_size = (dst_width) * (dst_height)*4LL; int dst_stride_argb = (dst_width)*4; align_buffer_page_end(dst_argb_c, dst_argb_plane_size); align_buffer_page_end(dst_argb_opt, dst_argb_plane_size); if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) { printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } // Fill YUV image with continuous ramp, which is less sensitive to // subsampling and filtering differences for test purposes. FillRamp(src_y, Abs(src_width), Abs(src_height), 128, 1, 1); FillRamp(src_u, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 3, 1, 1); FillRamp(src_v, (Abs(src_width) + 1) / 2, (Abs(src_height) + 1) / 2, 4, 1, 1); memset(dst_argb_c, 2, dst_argb_plane_size); memset(dst_argb_opt, 3, dst_argb_plane_size); YUVToARGBScaleReference2(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, libyuv::FOURCC_I420, src_width, src_height, dst_argb_c, dst_stride_argb, libyuv::FOURCC_I420, dst_width, dst_height, 0, 0, dst_width, dst_height, f); for (int i = 0; i < benchmark_iterations; ++i) { YUVToARGBScaleClip(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, libyuv::FOURCC_I420, src_width, src_height, dst_argb_opt, dst_stride_argb, libyuv::FOURCC_I420, dst_width, dst_height, 0, 0, dst_width, dst_height, f); } int max_diff = 0; for (int i = 0; i < dst_height; ++i) { for (int j = 0; j < dst_width * 4; ++j) { int abs_diff = Abs(dst_argb_c[(i * dst_stride_argb) + j] - dst_argb_opt[(i * dst_stride_argb) + j]); if (abs_diff > max_diff) { printf("error %d at %d,%d c %d opt %d", abs_diff, j, i, dst_argb_c[(i * dst_stride_argb) + j], dst_argb_opt[(i * dst_stride_argb) + j]); EXPECT_LE(abs_diff, 40); max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_argb_c); free_aligned_buffer_page_end(dst_argb_opt); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); return max_diff; } TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) { int diff = YUVToARGBTestFilter(benchmark_width_, benchmark_height_, benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, libyuv::kFilterBilinear, benchmark_iterations_); EXPECT_LE(diff, 10); } TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) { int diff = YUVToARGBTestFilter( benchmark_width_ * 3 / 2, 
benchmark_height_ * 3 / 2, benchmark_width_, benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_); EXPECT_LE(diff, 10); } TEST_F(LibYUVScaleTest, ARGBTest3x) { const int kSrcStride = 480 * 4; const int kDstStride = 160 * 4; const int kSize = kSrcStride * 3; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < 480 * 3; ++i) { orig_pixels[i * 4 + 0] = i; orig_pixels[i * 4 + 1] = 255 - i; orig_pixels[i * 4 + 2] = i + 1; orig_pixels[i * 4 + 3] = i + 10; } align_buffer_page_end(dest_pixels, kDstStride); int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterBilinear); } EXPECT_EQ(225, dest_pixels[0]); EXPECT_EQ(255 - 225, dest_pixels[1]); EXPECT_EQ(226, dest_pixels[2]); EXPECT_EQ(235, dest_pixels[3]); ARGBScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterNone); EXPECT_EQ(225, dest_pixels[0]); EXPECT_EQ(255 - 225, dest_pixels[1]); EXPECT_EQ(226, dest_pixels[2]); EXPECT_EQ(235, dest_pixels[3]); free_aligned_buffer_page_end(dest_pixels); free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVScaleTest, ARGBTest4x) { const int kSrcStride = 640 * 4; const int kDstStride = 160 * 4; const int kSize = kSrcStride * 4; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < 640 * 4; ++i) { orig_pixels[i * 4 + 0] = i; orig_pixels[i * 4 + 1] = 255 - i; orig_pixels[i * 4 + 2] = i + 1; orig_pixels[i * 4 + 3] = i + 10; } align_buffer_page_end(dest_pixels, kDstStride); int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, kFilterBilinear); } EXPECT_NEAR(66, dest_pixels[0], 4); EXPECT_NEAR(255 - 66, dest_pixels[1], 4); EXPECT_NEAR(67, dest_pixels[2], 4); EXPECT_NEAR(76, dest_pixels[3], 4); 
ARGBScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, kFilterNone); EXPECT_EQ(2, dest_pixels[0]); EXPECT_EQ(255 - 2, dest_pixels[1]); EXPECT_EQ(3, dest_pixels[2]); EXPECT_EQ(12, dest_pixels[3]); free_aligned_buffer_page_end(dest_pixels); free_aligned_buffer_page_end(orig_pixels); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/scale_test.cc000066400000000000000000001760771416500237200222230ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" #ifdef ENABLE_ROW_TESTS #include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C #endif #define STRINGIZE(line) #line #define FILELINESTR(file, line) file ":" STRINGIZE(line) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) // SLOW TESTS are those that are unoptimized C code. // FULL TESTS are optimized but test many variations of the same code. #define ENABLE_FULL_TESTS #endif namespace libyuv { // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
static int I420TestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i, j; int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); if (!src_y || !src_u || !src_v) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MemRandomize(src_y, src_y_plane_size); MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); int dst_width_uv = (dst_width + 1) >> 1; int dst_height_uv = (dst_height + 1) >> 1; int64_t dst_y_plane_size = (dst_width) * (dst_height); int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv; align_buffer_page_end(dst_y_c, dst_y_plane_size); align_buffer_page_end(dst_u_c, dst_uv_plane_size); align_buffer_page_end(dst_v_c, dst_uv_plane_size); align_buffer_page_end(dst_y_opt, dst_y_plane_size); align_buffer_page_end(dst_u_opt, dst_uv_plane_size); align_buffer_page_end(dst_v_opt, dst_uv_plane_size); if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt || !dst_v_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
double c_time = get_time(); I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_c, dst_stride_y, dst_u_c, dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt, dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT. printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference is not // over 3. 
int max_diff = 0; for (i = 0; i < (dst_height); ++i) { for (j = 0; j < (dst_width); ++j) { int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - dst_y_opt[(i * dst_stride_y) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } for (i = 0; i < (dst_height_uv); ++i) { for (j = 0; j < (dst_width_uv); ++j) { int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] - dst_u_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] - dst_v_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_y_c); free_aligned_buffer_page_end(dst_u_c); free_aligned_buffer_page_end(dst_v_c); free_aligned_buffer_page_end(dst_y_opt); free_aligned_buffer_page_end(dst_u_opt); free_aligned_buffer_page_end(dst_v_opt); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); return max_diff; } // Test scaling with 8 bit C vs 12 bit C and return maximum pixel difference. // 0 = exact. 
static int I420TestFilter_12(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i; int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); align_buffer_page_end(src_y_12, src_y_plane_size * 2); align_buffer_page_end(src_u_12, src_uv_plane_size * 2); align_buffer_page_end(src_v_12, src_uv_plane_size * 2); if (!src_y || !src_u || !src_v || !src_y_12 || !src_u_12 || !src_v_12) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } uint16_t* p_src_y_12 = reinterpret_cast(src_y_12); uint16_t* p_src_u_12 = reinterpret_cast(src_u_12); uint16_t* p_src_v_12 = reinterpret_cast(src_v_12); MemRandomize(src_y, src_y_plane_size); MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); for (i = 0; i < src_y_plane_size; ++i) { p_src_y_12[i] = src_y[i]; } for (i = 0; i < src_uv_plane_size; ++i) { p_src_u_12[i] = src_u[i]; p_src_v_12[i] = src_v[i]; } int dst_width_uv = (dst_width + 1) >> 1; int dst_height_uv = (dst_height + 1) >> 1; int dst_y_plane_size = (dst_width) * (dst_height); int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv; align_buffer_page_end(dst_y_8, dst_y_plane_size); align_buffer_page_end(dst_u_8, dst_uv_plane_size); align_buffer_page_end(dst_v_8, dst_uv_plane_size); align_buffer_page_end(dst_y_12, dst_y_plane_size * 2); align_buffer_page_end(dst_u_12, dst_uv_plane_size * 2); 
align_buffer_page_end(dst_v_12, dst_uv_plane_size * 2); uint16_t* p_dst_y_12 = reinterpret_cast(dst_y_12); uint16_t* p_dst_u_12 = reinterpret_cast(dst_u_12); uint16_t* p_dst_v_12 = reinterpret_cast(dst_v_12); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (i = 0; i < benchmark_iterations; ++i) { I420Scale_12(p_src_y_12, src_stride_y, p_src_u_12, src_stride_uv, p_src_v_12, src_stride_uv, src_width, src_height, p_dst_y_12, dst_stride_y, p_dst_u_12, dst_stride_uv, p_dst_v_12, dst_stride_uv, dst_width, dst_height, f); } // Expect an exact match. int max_diff = 0; for (i = 0; i < dst_y_plane_size; ++i) { int abs_diff = Abs(dst_y_8[i] - p_dst_y_12[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } for (i = 0; i < dst_uv_plane_size; ++i) { int abs_diff = Abs(dst_u_8[i] - p_dst_u_12[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } abs_diff = Abs(dst_v_8[i] - p_dst_v_12[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(dst_y_8); free_aligned_buffer_page_end(dst_u_8); free_aligned_buffer_page_end(dst_v_8); free_aligned_buffer_page_end(dst_y_12); free_aligned_buffer_page_end(dst_u_12); free_aligned_buffer_page_end(dst_v_12); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); free_aligned_buffer_page_end(src_y_12); free_aligned_buffer_page_end(src_u_12); free_aligned_buffer_page_end(src_v_12); return max_diff; } // Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. // 0 = exact. 
static int I420TestFilter_16(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i; int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); align_buffer_page_end(src_y_16, src_y_plane_size * 2); align_buffer_page_end(src_u_16, src_uv_plane_size * 2); align_buffer_page_end(src_v_16, src_uv_plane_size * 2); if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } uint16_t* p_src_y_16 = reinterpret_cast(src_y_16); uint16_t* p_src_u_16 = reinterpret_cast(src_u_16); uint16_t* p_src_v_16 = reinterpret_cast(src_v_16); MemRandomize(src_y, src_y_plane_size); MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); for (i = 0; i < src_y_plane_size; ++i) { p_src_y_16[i] = src_y[i]; } for (i = 0; i < src_uv_plane_size; ++i) { p_src_u_16[i] = src_u[i]; p_src_v_16[i] = src_v[i]; } int dst_width_uv = (dst_width + 1) >> 1; int dst_height_uv = (dst_height + 1) >> 1; int dst_y_plane_size = (dst_width) * (dst_height); int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv; align_buffer_page_end(dst_y_8, dst_y_plane_size); align_buffer_page_end(dst_u_8, dst_uv_plane_size); align_buffer_page_end(dst_v_8, dst_uv_plane_size); align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); 
align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); uint16_t* p_dst_y_16 = reinterpret_cast(dst_y_16); uint16_t* p_dst_u_16 = reinterpret_cast(dst_u_16); uint16_t* p_dst_v_16 = reinterpret_cast(dst_v_16); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. I420Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (i = 0; i < benchmark_iterations; ++i) { I420Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, dst_stride_uv, dst_width, dst_height, f); } // Expect an exact match. int max_diff = 0; for (i = 0; i < dst_y_plane_size; ++i) { int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } for (i = 0; i < dst_uv_plane_size; ++i) { int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(dst_y_8); free_aligned_buffer_page_end(dst_u_8); free_aligned_buffer_page_end(dst_v_8); free_aligned_buffer_page_end(dst_y_16); free_aligned_buffer_page_end(dst_u_16); free_aligned_buffer_page_end(dst_v_16); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); free_aligned_buffer_page_end(src_y_16); free_aligned_buffer_page_end(src_u_16); free_aligned_buffer_page_end(src_v_16); return max_diff; } // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
static int I444TestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i, j; int src_width_uv = Abs(src_width); int src_height_uv = Abs(src_height); int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); if (!src_y || !src_u || !src_v) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MemRandomize(src_y, src_y_plane_size); MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); int dst_width_uv = dst_width; int dst_height_uv = dst_height; int64_t dst_y_plane_size = (dst_width) * (dst_height); int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv; align_buffer_page_end(dst_y_c, dst_y_plane_size); align_buffer_page_end(dst_u_c, dst_uv_plane_size); align_buffer_page_end(dst_v_c, dst_uv_plane_size); align_buffer_page_end(dst_y_opt, dst_y_plane_size); align_buffer_page_end(dst_u_opt, dst_uv_plane_size); align_buffer_page_end(dst_v_opt, dst_uv_plane_size); if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt || !dst_v_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
double c_time = get_time(); I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_c, dst_stride_y, dst_u_c, dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt, dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT. printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference is not // over 3. 
int max_diff = 0; for (i = 0; i < (dst_height); ++i) { for (j = 0; j < (dst_width); ++j) { int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - dst_y_opt[(i * dst_stride_y) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } for (i = 0; i < (dst_height_uv); ++i) { for (j = 0; j < (dst_width_uv); ++j) { int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] - dst_u_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] - dst_v_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_y_c); free_aligned_buffer_page_end(dst_u_c); free_aligned_buffer_page_end(dst_v_c); free_aligned_buffer_page_end(dst_y_opt); free_aligned_buffer_page_end(dst_u_opt); free_aligned_buffer_page_end(dst_v_opt); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); return max_diff; } // Test scaling with 8 bit C vs 12 bit C and return maximum pixel difference. // 0 = exact. static int I444TestFilter_12(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i; int src_width_uv = Abs(src_width); int src_height_uv = Abs(src_height); int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); align_buffer_page_end(src_y_12, src_y_plane_size * 2); align_buffer_page_end(src_u_12, src_uv_plane_size * 2); align_buffer_page_end(src_v_12, src_uv_plane_size * 2); if (!src_y || !src_u || !src_v || !src_y_12 || !src_u_12 || !src_v_12) { printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } uint16_t* p_src_y_12 = reinterpret_cast(src_y_12); uint16_t* p_src_u_12 = reinterpret_cast(src_u_12); uint16_t* p_src_v_12 = reinterpret_cast(src_v_12); MemRandomize(src_y, src_y_plane_size); MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); for (i = 0; i < src_y_plane_size; ++i) { p_src_y_12[i] = src_y[i]; } for (i = 0; i < src_uv_plane_size; ++i) { p_src_u_12[i] = src_u[i]; p_src_v_12[i] = src_v[i]; } int dst_width_uv = dst_width; int dst_height_uv = dst_height; int dst_y_plane_size = (dst_width) * (dst_height); int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv; align_buffer_page_end(dst_y_8, dst_y_plane_size); align_buffer_page_end(dst_u_8, dst_uv_plane_size); align_buffer_page_end(dst_v_8, dst_uv_plane_size); align_buffer_page_end(dst_y_12, dst_y_plane_size * 2); align_buffer_page_end(dst_u_12, dst_uv_plane_size * 2); align_buffer_page_end(dst_v_12, dst_uv_plane_size * 2); uint16_t* p_dst_y_12 = reinterpret_cast(dst_y_12); uint16_t* p_dst_u_12 = reinterpret_cast(dst_u_12); uint16_t* p_dst_v_12 = reinterpret_cast(dst_v_12); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (i = 0; i < benchmark_iterations; ++i) { I444Scale_12(p_src_y_12, src_stride_y, p_src_u_12, src_stride_uv, p_src_v_12, src_stride_uv, src_width, src_height, p_dst_y_12, dst_stride_y, p_dst_u_12, dst_stride_uv, p_dst_v_12, dst_stride_uv, dst_width, dst_height, f); } // Expect an exact match. 
int max_diff = 0; for (i = 0; i < dst_y_plane_size; ++i) { int abs_diff = Abs(dst_y_8[i] - p_dst_y_12[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } for (i = 0; i < dst_uv_plane_size; ++i) { int abs_diff = Abs(dst_u_8[i] - p_dst_u_12[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } abs_diff = Abs(dst_v_8[i] - p_dst_v_12[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(dst_y_8); free_aligned_buffer_page_end(dst_u_8); free_aligned_buffer_page_end(dst_v_8); free_aligned_buffer_page_end(dst_y_12); free_aligned_buffer_page_end(dst_u_12); free_aligned_buffer_page_end(dst_v_12); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); free_aligned_buffer_page_end(src_y_12); free_aligned_buffer_page_end(src_u_12); free_aligned_buffer_page_end(src_v_12); return max_diff; } // Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. // 0 = exact. static int I444TestFilter_16(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i; int src_width_uv = Abs(src_width); int src_height_uv = Abs(src_height); int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); align_buffer_page_end(src_y_16, src_y_plane_size * 2); align_buffer_page_end(src_u_16, src_uv_plane_size * 2); align_buffer_page_end(src_v_16, src_uv_plane_size * 2); if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } uint16_t* p_src_y_16 = reinterpret_cast(src_y_16); uint16_t* p_src_u_16 = reinterpret_cast(src_u_16); uint16_t* p_src_v_16 = reinterpret_cast(src_v_16); MemRandomize(src_y, src_y_plane_size); MemRandomize(src_u, src_uv_plane_size); MemRandomize(src_v, src_uv_plane_size); for (i = 0; i < src_y_plane_size; ++i) { p_src_y_16[i] = src_y[i]; } for (i = 0; i < src_uv_plane_size; ++i) { p_src_u_16[i] = src_u[i]; p_src_v_16[i] = src_v[i]; } int dst_width_uv = dst_width; int dst_height_uv = dst_height; int dst_y_plane_size = (dst_width) * (dst_height); int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv; align_buffer_page_end(dst_y_8, dst_y_plane_size); align_buffer_page_end(dst_u_8, dst_uv_plane_size); align_buffer_page_end(dst_v_8, dst_uv_plane_size); align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); uint16_t* p_dst_y_16 = reinterpret_cast(dst_y_16); uint16_t* p_dst_u_16 = reinterpret_cast(dst_u_16); uint16_t* p_dst_v_16 = reinterpret_cast(dst_v_16); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (i = 0; i < benchmark_iterations; ++i) { I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, dst_stride_uv, dst_width, dst_height, f); } // Expect an exact match. 
int max_diff = 0; for (i = 0; i < dst_y_plane_size; ++i) { int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } for (i = 0; i < dst_uv_plane_size; ++i) { int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(dst_y_8); free_aligned_buffer_page_end(dst_u_8); free_aligned_buffer_page_end(dst_v_8); free_aligned_buffer_page_end(dst_y_16); free_aligned_buffer_page_end(dst_u_16); free_aligned_buffer_page_end(dst_v_16); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_u); free_aligned_buffer_page_end(src_v); free_aligned_buffer_page_end(src_y_16); free_aligned_buffer_page_end(src_u_16); free_aligned_buffer_page_end(src_v_16); return max_diff; } // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int NV12TestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i, j; int src_width_uv = (Abs(src_width) + 1) >> 1; int src_height_uv = (Abs(src_height) + 1) >> 1; int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv)*2; int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv * 2; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_uv, src_uv_plane_size); if (!src_y || !src_uv) { printf("Skipped. 
Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MemRandomize(src_y, src_y_plane_size); MemRandomize(src_uv, src_uv_plane_size); int dst_width_uv = (dst_width + 1) >> 1; int dst_height_uv = (dst_height + 1) >> 1; int64_t dst_y_plane_size = (dst_width) * (dst_height); int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv)*2; int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv * 2; align_buffer_page_end(dst_y_c, dst_y_plane_size); align_buffer_page_end(dst_uv_c, dst_uv_plane_size); align_buffer_page_end(dst_y_opt, dst_y_plane_size); align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); if (!dst_y_c || !dst_uv_c || !dst_y_opt || !dst_uv_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height, dst_y_c, dst_stride_y, dst_uv_c, dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { NV12Scale(src_y, src_stride_y, src_uv, src_stride_uv, src_width, src_height, dst_y_opt, dst_stride_y, dst_uv_opt, dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT. printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference is not // over 3. 
int max_diff = 0; for (i = 0; i < (dst_height); ++i) { for (j = 0; j < (dst_width); ++j) { int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - dst_y_opt[(i * dst_stride_y) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } for (i = 0; i < (dst_height_uv); ++i) { for (j = 0; j < (dst_width_uv * 2); ++j) { int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - dst_uv_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_y_c); free_aligned_buffer_page_end(dst_uv_c); free_aligned_buffer_page_end(dst_y_opt); free_aligned_buffer_page_end(dst_uv_opt); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_uv); return max_diff; } // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. // 2 is chroma subsample. #define DX(x, nom, denom) static_cast(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) #define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ int diff = I420TestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \ int diff = I444TestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_12) { \ int diff = I420TestFilter_12( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, 
nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_12) { \ int diff = I444TestFilter_12( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, NV12ScaleDownBy##name##_##filter) { \ int diff = NV12TestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. 
#ifndef DISABLE_SLOW_TESTS
// Slow tests enabled: instantiate every filter with an empty DISABLED_
// argument.
#define TEST_FACTOR(name, nom, denom, boxdiff) \
  TEST_FACTOR1(, name, None, nom, denom, 0) \
  TEST_FACTOR1(, name, Linear, nom, denom, 3) \
  TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \
  TEST_FACTOR1(, name, Box, nom, denom, boxdiff)
#else
#if defined(ENABLE_FULL_TESTS)
// Slow tests disabled, full tests enabled: all four filters, passing the
// DISABLED_ token as the first macro argument.
#define TEST_FACTOR(name, nom, denom, boxdiff) \
  TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \
  TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \
  TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
  TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
#else
// Neither slow nor full tests: only Bilinear and Box are instantiated.
#define TEST_FACTOR(name, nom, denom, boxdiff) \
  TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \
  TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff)
#endif
#endif

// Arguments are (test name suffix, nom, denom, max allowed diff for Box).
TEST_FACTOR(2, 1, 2, 0)
TEST_FACTOR(4, 1, 4, 0)
#ifndef DISABLE_SLOW_TESTS
TEST_FACTOR(8, 1, 8, 0)
#endif
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX

// Defines, for one target size and one filter, a "To" test (benchmark size ->
// width x height) and a "From" test (width x height -> benchmark size) for
// each of I420, I444 and NV12, plus 12 bit and 16 bit I420/I444 variants.
// The DISABLED_ macro argument is pasted in front of the _12 and _16 test
// names only; the 8 bit test names never carry it.
#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \
  TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \
    int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) { \
    int diff = I444TestFilter(benchmark_width_, benchmark_height_, width, \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I420##name##To##width##x##height##_##filter##_12) { \
    int diff = I420TestFilter_12( \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I444##name##To##width##x##height##_##filter##_12) { \
    int diff = I444TestFilter_12( \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \
    int diff = I420TestFilter_16( \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \
    int diff = I444TestFilter_16( \
        benchmark_width_, benchmark_height_, width, height, kFilter##filter, \
        benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, NV12##name##To##width##x##height##_##filter) { \
    int diff = NV12TestFilter(benchmark_width_, benchmark_height_, width, \
                              height, kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \
    int diff = I420TestFilter(width, height, Abs(benchmark_width_), \
                              Abs(benchmark_height_), kFilter##filter, \
                              benchmark_iterations_, disable_cpu_flags_, \
                              benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) { \
    int diff = I444TestFilter(width, height, Abs(benchmark_width_), \
                              Abs(benchmark_height_), kFilter##filter, \
                              benchmark_iterations_, disable_cpu_flags_, \
                              benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I420##name##From##width##x##height##_##filter##_12) { \
    int diff = I420TestFilter_12(width, height, Abs(benchmark_width_), \
                                 Abs(benchmark_height_), kFilter##filter, \
                                 benchmark_iterations_, disable_cpu_flags_, \
                                 benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I444##name##From##width##x##height##_##filter##_12) { \
    int diff = I444TestFilter_12(width, height, Abs(benchmark_width_), \
                                 Abs(benchmark_height_), kFilter##filter, \
                                 benchmark_iterations_, disable_cpu_flags_, \
                                 benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \
    int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \
                                 Abs(benchmark_height_), kFilter##filter, \
                                 benchmark_iterations_, disable_cpu_flags_, \
                                 benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, \
         DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \
    int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \
                                 Abs(benchmark_height_), kFilter##filter, \
                                 benchmark_iterations_, disable_cpu_flags_, \
                                 benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, NV12##name##From##width##x##height##_##filter) { \
    int diff = NV12TestFilter(width, height, Abs(benchmark_width_), \
                              Abs(benchmark_height_), kFilter##filter, \
                              benchmark_iterations_, disable_cpu_flags_, \
                              benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  }

#ifndef DISABLE_SLOW_TESTS
// Test scale to a specified size with all 4 filters.
#define TEST_SCALETO(name, width, height) \
  TEST_SCALETO1(, name, width, height, None, 0) \
  TEST_SCALETO1(, name, width, height, Linear, 3) \
  TEST_SCALETO1(, name, width, height, Bilinear, 3) \
  TEST_SCALETO1(, name, width, height, Box, 3)
#else
#if defined(ENABLE_FULL_TESTS)
// Slow tests disabled, full tests enabled: all four filters, passing the
// DISABLED_ token as the first macro argument.
#define TEST_SCALETO(name, width, height) \
  TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \
  TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \
  TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
  TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
#else
// Neither slow nor full tests: only Bilinear and Box are instantiated.
#define TEST_SCALETO(name, width, height) \
  TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \
  TEST_SCALETO1(DISABLED_, name, width, height, Box, 3)
#endif
#endif

TEST_SCALETO(Scale, 1, 1)
// TEST_SCALETO(Scale, 256, 144) /* 128x72 * 2 */
TEST_SCALETO(Scale, 320, 240)
TEST_SCALETO(Scale, 569, 480)
TEST_SCALETO(Scale, 640, 360)
#ifndef DISABLE_SLOW_TESTS
TEST_SCALETO(Scale, 1280, 720)
TEST_SCALETO(Scale, 1920, 1080)
#endif  // DISABLE_SLOW_TESTS
#undef TEST_SCALETO1
#undef TEST_SCALETO

// Defines, for one filter, tests that scale a benchmark_width_ x
// benchmark_height_ source to a benchmark_height_ x benchmark_width_
// destination (width and height swapped) for I420, I444 and NV12, plus
// 12 bit and 16 bit I420/I444 variants.
// The DISABLED_ macro argument is pasted in front of the _12 and _16 test
// names only; the 8 bit test names never carry it.
#define TEST_SCALESWAPXY1(DISABLED_, name, filter, max_diff) \
  TEST_F(LibYUVScaleTest, I420##name##SwapXY_##filter) { \
    int diff = I420TestFilter(benchmark_width_, benchmark_height_, \
                              benchmark_height_, benchmark_width_, \
                              kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, I444##name##SwapXY_##filter) { \
    int diff = I444TestFilter(benchmark_width_, benchmark_height_, \
                              benchmark_height_, benchmark_width_, \
                              kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_12) { \
    int diff = I420TestFilter_12(benchmark_width_, benchmark_height_, \
                                 benchmark_height_, benchmark_width_, \
                                 kFilter##filter, benchmark_iterations_, \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_12) { \
    int diff = I444TestFilter_12(benchmark_width_, benchmark_height_, \
                                 benchmark_height_, benchmark_width_, \
                                 kFilter##filter, benchmark_iterations_, \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, DISABLED_##I420##name##SwapXY_##filter##_16) { \
    int diff = I420TestFilter_16(benchmark_width_, benchmark_height_, \
                                 benchmark_height_, benchmark_width_, \
                                 kFilter##filter, benchmark_iterations_, \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, DISABLED_##I444##name##SwapXY_##filter##_16) { \
    int diff = I444TestFilter_16(benchmark_width_, benchmark_height_, \
                                 benchmark_height_, benchmark_width_, \
                                 kFilter##filter, benchmark_iterations_, \
                                 disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  } \
  TEST_F(LibYUVScaleTest, NV12##name##SwapXY_##filter) { \
    int diff = NV12TestFilter(benchmark_width_, benchmark_height_, \
                              benchmark_height_, benchmark_width_, \
                              kFilter##filter, benchmark_iterations_, \
                              disable_cpu_flags_, benchmark_cpu_info_); \
    EXPECT_LE(diff, max_diff); \
  }

// Test scale with swapped width and height with all 4 filters.
#ifndef DISABLE_SLOW_TESTS TEST_SCALESWAPXY1(, Scale, None, 0) TEST_SCALESWAPXY1(, Scale, Linear, 3) TEST_SCALESWAPXY1(, Scale, Bilinear, 3) TEST_SCALESWAPXY1(, Scale, Box, 3) #else #if defined(ENABLE_FULL_TESTS) TEST_SCALESWAPXY1(DISABLED_, Scale, None, 0) TEST_SCALESWAPXY1(DISABLED_, Scale, Linear, 3) TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3) TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3) #else TEST_SCALESWAPXY1(DISABLED_, Scale, Bilinear, 3) TEST_SCALESWAPXY1(DISABLED_, Scale, Box, 3) #endif #endif #undef TEST_SCALESWAPXY1 #ifdef ENABLE_ROW_TESTS #ifdef HAS_SCALEROWDOWN2_SSSE3 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); SIMD_ALIGNED(uint8_t dst_pixels_opt[64]); SIMD_ALIGNED(uint8_t dst_pixels_c[64]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_opt, 0, sizeof(dst_pixels_opt)); memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); if (!has_ssse3) { printf("Warning SSSE3 not detected; Skipping test.\n"); } else { // TL. orig_pixels[0] = 255u; orig_pixels[1] = 0u; orig_pixels[128 + 0] = 0u; orig_pixels[128 + 1] = 0u; // TR. orig_pixels[2] = 0u; orig_pixels[3] = 100u; orig_pixels[128 + 2] = 0u; orig_pixels[128 + 3] = 0u; // BL. orig_pixels[4] = 0u; orig_pixels[5] = 0u; orig_pixels[128 + 4] = 50u; orig_pixels[128 + 5] = 0u; // BR. orig_pixels[6] = 0u; orig_pixels[7] = 0u; orig_pixels[128 + 6] = 0u; orig_pixels[128 + 7] = 20u; // Odd. orig_pixels[126] = 4u; orig_pixels[127] = 255u; orig_pixels[128 + 126] = 16u; orig_pixels[128 + 127] = 255u; // Test regular half size. ScaleRowDown2Box_C(orig_pixels, 128, dst_pixels_c, 64); EXPECT_EQ(64u, dst_pixels_c[0]); EXPECT_EQ(25u, dst_pixels_c[1]); EXPECT_EQ(13u, dst_pixels_c[2]); EXPECT_EQ(5u, dst_pixels_c[3]); EXPECT_EQ(0u, dst_pixels_c[4]); EXPECT_EQ(133u, dst_pixels_c[63]); // Test Odd width version - Last pixel is just 1 horizontal pixel. 
ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); EXPECT_EQ(64u, dst_pixels_c[0]); EXPECT_EQ(25u, dst_pixels_c[1]); EXPECT_EQ(13u, dst_pixels_c[2]); EXPECT_EQ(5u, dst_pixels_c[3]); EXPECT_EQ(0u, dst_pixels_c[4]); EXPECT_EQ(10u, dst_pixels_c[63]); // Test one pixel less, should skip the last pixel. memset(dst_pixels_c, 0, sizeof(dst_pixels_c)); ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 63); EXPECT_EQ(64u, dst_pixels_c[0]); EXPECT_EQ(25u, dst_pixels_c[1]); EXPECT_EQ(13u, dst_pixels_c[2]); EXPECT_EQ(5u, dst_pixels_c[3]); EXPECT_EQ(0u, dst_pixels_c[4]); EXPECT_EQ(0u, dst_pixels_c[63]); // Test regular half size SSSE3. ScaleRowDown2Box_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); EXPECT_EQ(64u, dst_pixels_opt[0]); EXPECT_EQ(25u, dst_pixels_opt[1]); EXPECT_EQ(13u, dst_pixels_opt[2]); EXPECT_EQ(5u, dst_pixels_opt[3]); EXPECT_EQ(0u, dst_pixels_opt[4]); EXPECT_EQ(133u, dst_pixels_opt[63]); // Compare C and SSSE3 match. ScaleRowDown2Box_Odd_C(orig_pixels, 128, dst_pixels_c, 64); ScaleRowDown2Box_Odd_SSSE3(orig_pixels, 128, dst_pixels_opt, 64); for (int i = 0; i < 64; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } } } #endif // HAS_SCALEROWDOWN2_SSSE3 extern "C" void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); extern "C" void ScaleRowUp2_16_MMI(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); extern "C" void ScaleRowUp2_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); TEST_F(LibYUVScaleTest, TestScaleRowUp2_16) { SIMD_ALIGNED(uint16_t orig_pixels[640 * 2 + 1]); // 2 rows + 1 pixel overrun. 
SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_opt, 1, sizeof(dst_pixels_opt)); memset(dst_pixels_c, 2, sizeof(dst_pixels_c)); for (int i = 0; i < 640 * 2 + 1; ++i) { orig_pixels[i] = i; } ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_c[0], 1280); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { ScaleRowUp2_16_NEON(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); } else { ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); } #elif !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) int has_mmi = TestCpuFlag(kCpuHasMMI); if (has_mmi) { ScaleRowUp2_16_MMI(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); } else { ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); } #else ScaleRowUp2_16_C(&orig_pixels[0], 640, &dst_pixels_opt[0], 1280); #endif } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } EXPECT_EQ(dst_pixels_c[0], (0 * 9 + 1 * 3 + 640 * 3 + 641 * 1 + 8) / 16); EXPECT_EQ(dst_pixels_c[1279], 800); } extern "C" void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst, int dst_width); TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { SIMD_ALIGNED(uint16_t orig_pixels[2560 * 2]); SIMD_ALIGNED(uint16_t dst_pixels_c[1280]); SIMD_ALIGNED(uint16_t dst_pixels_opt[1280]); memset(orig_pixels, 0, sizeof(orig_pixels)); memset(dst_pixels_c, 1, sizeof(dst_pixels_c)); memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt)); for (int i = 0; i < 2560 * 2; ++i) { orig_pixels[i] = i; } ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_c[0], 1280); for (int i = 0; i < benchmark_pixels_div1280_; ++i) { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { 
ScaleRowDown2Box_16_NEON(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); } else { ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); } #else ScaleRowDown2Box_16_C(&orig_pixels[0], 2560, &dst_pixels_opt[0], 1280); #endif } for (int i = 0; i < 1280; ++i) { EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); } EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4); EXPECT_EQ(dst_pixels_c[1279], 3839); } #endif // ENABLE_ROW_TESTS // Test scaling plane with 8 bit C vs 12 bit C and return maximum pixel // difference. // 0 = exact. static int TestPlaneFilter_16(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i; int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); int src_stride_y = Abs(src_width); int dst_y_plane_size = dst_width * dst_height; int dst_stride_y = dst_width; align_buffer_page_end(src_y, src_y_plane_size); align_buffer_page_end(src_y_16, src_y_plane_size * 2); align_buffer_page_end(dst_y_8, dst_y_plane_size); align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); uint16_t* p_src_y_16 = reinterpret_cast(src_y_16); uint16_t* p_dst_y_16 = reinterpret_cast(dst_y_16); MemRandomize(src_y, src_y_plane_size); memset(dst_y_8, 0, dst_y_plane_size); memset(dst_y_16, 1, dst_y_plane_size * 2); for (i = 0; i < src_y_plane_size; ++i) { p_src_y_16[i] = src_y[i] & 255; } MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y_8, dst_stride_y, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. for (i = 0; i < benchmark_iterations; ++i) { ScalePlane_16(p_src_y_16, src_stride_y, src_width, src_height, p_dst_y_16, dst_stride_y, dst_width, dst_height, f); } // Expect an exact match. 
int max_diff = 0; for (i = 0; i < dst_y_plane_size; ++i) { int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); if (abs_diff > max_diff) { max_diff = abs_diff; } } free_aligned_buffer_page_end(dst_y_8); free_aligned_buffer_page_end(dst_y_16); free_aligned_buffer_page_end(src_y); free_aligned_buffer_page_end(src_y_16); return max_diff; } // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. // 2 is chroma subsample. #define DX(x, nom, denom) static_cast(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) #define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \ int diff = TestPlaneFilter_16( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. #define TEST_FACTOR(name, nom, denom, boxdiff) \ TEST_FACTOR1(name, None, nom, denom, 0) \ TEST_FACTOR1(name, Linear, nom, denom, boxdiff) \ TEST_FACTOR1(name, Bilinear, nom, denom, boxdiff) \ TEST_FACTOR1(name, Box, nom, denom, boxdiff) TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) // TEST_FACTOR(8, 1, 8, 0) Disable for benchmark performance. Takes 90 seconds. 
TEST_FACTOR(3by4, 3, 4, 1)
TEST_FACTOR(3by8, 3, 8, 1)
TEST_FACTOR(3, 1, 3, 0)
#undef TEST_FACTOR1
#undef TEST_FACTOR
#undef SX
#undef DX

// Scales a 480x3 ramp down to 160x1 and checks the first output pixel for
// both the Bilinear and the None filter.
TEST_F(LibYUVScaleTest, PlaneTest3x) {
  const int kSrcStride = 480;
  const int kDstStride = 160;
  const int kSize = kSrcStride * 3;

  align_buffer_page_end(orig_pixels, kSize);
  // Ramp pattern; the int index is narrowed to the 8 bit pixel on store.
  for (int idx = 0; idx < kSize; ++idx) {
    orig_pixels[idx] = idx;
  }
  align_buffer_page_end(dest_pixels, kDstStride);

  // Number of 160 pixel rows in one benchmark frame, rounded up, times the
  // requested iteration count.
  const int rows160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160;
  const int iterations160 = rows160 * benchmark_iterations_;
  for (int pass = 0; pass < iterations160; ++pass) {
    ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
               kFilterBilinear);
  }
  EXPECT_EQ(225, dest_pixels[0]);

  ScalePlane(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1,
             kFilterNone);
  EXPECT_EQ(225, dest_pixels[0]);

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
}

// Scales a 640x4 ramp down to 160x1 and checks the first output pixel for
// both the Bilinear and the None filter.
TEST_F(LibYUVScaleTest, PlaneTest4x) {
  const int kSrcStride = 640;
  const int kDstStride = 160;
  const int kSize = kSrcStride * 4;

  align_buffer_page_end(orig_pixels, kSize);
  // Ramp pattern; the int index is narrowed to the 8 bit pixel on store.
  for (int idx = 0; idx < kSize; ++idx) {
    orig_pixels[idx] = idx;
  }
  align_buffer_page_end(dest_pixels, kDstStride);

  // Number of 160 pixel rows in one benchmark frame, rounded up, times the
  // requested iteration count.
  const int rows160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160;
  const int iterations160 = rows160 * benchmark_iterations_;
  for (int pass = 0; pass < iterations160; ++pass) {
    ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
               kFilterBilinear);
  }
  EXPECT_EQ(66, dest_pixels[0]);

  ScalePlane(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1,
             kFilterNone);
  EXPECT_EQ(2, dest_pixels[0]);  // expect the 3rd pixel of the 3rd row

  free_aligned_buffer_page_end(dest_pixels);
  free_aligned_buffer_page_end(orig_pixels);
}

// Intent is to test 200x50 to 50x200 but width and height can be parameters.
TEST_F(LibYUVScaleTest, PlaneTestRotate_None) { const int kSize = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < kSize; ++i) { orig_pixels[i] = i; } align_buffer_page_end(dest_opt_pixels, kSize); align_buffer_page_end(dest_c_pixels, kSize); MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, dest_c_pixels, benchmark_height_, benchmark_height_, benchmark_width_, kFilterNone); MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations_; ++i) { ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, dest_opt_pixels, benchmark_height_, benchmark_height_, benchmark_width_, kFilterNone); } for (int i = 0; i < kSize; ++i) { EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); } free_aligned_buffer_page_end(dest_c_pixels); free_aligned_buffer_page_end(dest_opt_pixels); free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVScaleTest, PlaneTestRotate_Bilinear) { const int kSize = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < kSize; ++i) { orig_pixels[i] = i; } align_buffer_page_end(dest_opt_pixels, kSize); align_buffer_page_end(dest_c_pixels, kSize); MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, dest_c_pixels, benchmark_height_, benchmark_height_, benchmark_width_, kFilterBilinear); MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. 
for (int i = 0; i < benchmark_iterations_; ++i) { ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, dest_opt_pixels, benchmark_height_, benchmark_height_, benchmark_width_, kFilterBilinear); } for (int i = 0; i < kSize; ++i) { EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); } free_aligned_buffer_page_end(dest_c_pixels); free_aligned_buffer_page_end(dest_opt_pixels); free_aligned_buffer_page_end(orig_pixels); } // Intent is to test 200x50 to 50x200 but width and height can be parameters. TEST_F(LibYUVScaleTest, PlaneTestRotate_Box) { const int kSize = benchmark_width_ * benchmark_height_; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < kSize; ++i) { orig_pixels[i] = i; } align_buffer_page_end(dest_opt_pixels, kSize); align_buffer_page_end(dest_c_pixels, kSize); MaskCpuFlags(disable_cpu_flags_); // Disable all CPU optimization. ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, dest_c_pixels, benchmark_height_, benchmark_height_, benchmark_width_, kFilterBox); MaskCpuFlags(benchmark_cpu_info_); // Enable all CPU optimization. for (int i = 0; i < benchmark_iterations_; ++i) { ScalePlane(orig_pixels, benchmark_width_, benchmark_width_, benchmark_height_, dest_opt_pixels, benchmark_height_, benchmark_height_, benchmark_width_, kFilterBox); } for (int i = 0; i < kSize; ++i) { EXPECT_EQ(dest_c_pixels[i], dest_opt_pixels[i]); } free_aligned_buffer_page_end(dest_c_pixels); free_aligned_buffer_page_end(dest_opt_pixels); free_aligned_buffer_page_end(orig_pixels); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/unit_test/scale_uv_test.cc000066400000000000000000000246721416500237200227260ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. 
An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale_uv.h" namespace libyuv { #define STRINGIZE(line) #line #define FILELINESTR(file, line) file ":" STRINGIZE(line) #if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__) // SLOW TESTS are those that are unoptimized C code. // FULL TESTS are optimized but test many variations of the same code. #define ENABLE_FULL_TESTS #endif // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static int UVTestFilter(int src_width, int src_height, int dst_width, int dst_height, FilterMode f, int benchmark_iterations, int disable_cpu_flags, int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } int i, j; const int b = 0; // 128 to test for padding/stride. int64_t src_uv_plane_size = (Abs(src_width) + b * 2) * (Abs(src_height) + b * 2) * 2LL; int src_stride_uv = (b * 2 + Abs(src_width)) * 2; align_buffer_page_end(src_uv, src_uv_plane_size); if (!src_uv) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } MemRandomize(src_uv, src_uv_plane_size); int64_t dst_uv_plane_size = (dst_width + b * 2) * (dst_height + b * 2) * 2LL; int dst_stride_uv = (b * 2 + dst_width) * 2; align_buffer_page_end(dst_uv_c, dst_uv_plane_size); align_buffer_page_end(dst_uv_opt, dst_uv_plane_size); if (!dst_uv_c || !dst_uv_opt) { printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); return 0; } memset(dst_uv_c, 2, dst_uv_plane_size); memset(dst_uv_opt, 3, dst_uv_plane_size); // Warm up both versions for consistent benchmarks. MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. 
UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, dst_width, dst_height, f); MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. double c_time = get_time(); UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, src_height, dst_uv_c + (dst_stride_uv * b) + b * 2, dst_stride_uv, dst_width, dst_height, f); c_time = (get_time() - c_time); MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. double opt_time = get_time(); for (i = 0; i < benchmark_iterations; ++i) { UVScale(src_uv + (src_stride_uv * b) + b * 2, src_stride_uv, src_width, src_height, dst_uv_opt + (dst_stride_uv * b) + b * 2, dst_stride_uv, dst_width, dst_height, f); } opt_time = (get_time() - opt_time) / benchmark_iterations; // Report performance of C vs OPT printf("filter %d - %8d us C - %8d us OPT\n", f, static_cast(c_time * 1e6), static_cast(opt_time * 1e6)); // C version may be a little off from the optimized. Order of // operations may introduce rounding somewhere. So do a difference // of the buffers and look to see that the max difference isn't // over 2. int max_diff = 0; for (i = b; i < (dst_height + b); ++i) { for (j = b * 2; j < (dst_width + b) * 2; ++j) { int abs_diff = Abs(dst_uv_c[(i * dst_stride_uv) + j] - dst_uv_opt[(i * dst_stride_uv) + j]); if (abs_diff > max_diff) { max_diff = abs_diff; } } } free_aligned_buffer_page_end(dst_uv_c); free_aligned_buffer_page_end(dst_uv_opt); free_aligned_buffer_page_end(src_uv); return max_diff; } // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. 
#define DX(x, nom, denom) static_cast((Abs(x) / nom) * nom) #define SX(x, nom, denom) static_cast((x / nom) * denom) #define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, UVScaleDownBy##name##_##filter) { \ int diff = UVTestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } #if defined(ENABLE_FULL_TESTS) // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. #define TEST_FACTOR(name, nom, denom) \ TEST_FACTOR1(name, None, nom, denom, 0) \ TEST_FACTOR1(name, Linear, nom, denom, 3) \ TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ TEST_FACTOR1(name, Box, nom, denom, 3) #else // Test a scale factor with Bilinear. #define TEST_FACTOR(name, nom, denom) \ TEST_FACTOR1(name, Bilinear, nom, denom, 3) #endif TEST_FACTOR(2, 1, 2) TEST_FACTOR(4, 1, 4) // TEST_FACTOR(8, 1, 8) Disable for benchmark performance. TEST_FACTOR(3by4, 3, 4) TEST_FACTOR(3by8, 3, 8) TEST_FACTOR(3, 1, 3) #undef TEST_FACTOR1 #undef TEST_FACTOR #undef SX #undef DX #define TEST_SCALETO1(name, width, height, filter, max_diff) \ TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ int diff = UVTestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ int diff = UVTestFilter(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } #if defined(ENABLE_FULL_TESTS) /// Test scale to a specified size with all 4 filters. 
#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, None, 0) \ TEST_SCALETO1(name, width, height, Linear, 3) \ TEST_SCALETO1(name, width, height, Bilinear, 3) #else #define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(name, width, height, Bilinear, 3) #endif TEST_SCALETO(UVScale, 1, 1) // TEST_SCALETO(UVScale, 256, 144) /* 128x72 * 2 */ TEST_SCALETO(UVScale, 320, 240) TEST_SCALETO(UVScale, 569, 480) TEST_SCALETO(UVScale, 640, 360) #ifndef DISABLE_SLOW_TESTS TEST_SCALETO(UVScale, 1280, 720) TEST_SCALETO(UVScale, 1920, 1080) #endif // DISABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO #define TEST_SCALESWAPXY1(name, filter, max_diff) \ TEST_F(LibYUVScaleTest, name##SwapXY_##filter) { \ int diff = \ UVTestFilter(benchmark_width_, benchmark_height_, benchmark_height_, \ benchmark_width_, kFilter##filter, benchmark_iterations_, \ disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } #if defined(ENABLE_FULL_TESTS) // Test scale with swapped width and height with all 3 filters. 
TEST_SCALESWAPXY1(UVScale, None, 0) TEST_SCALESWAPXY1(UVScale, Linear, 0) TEST_SCALESWAPXY1(UVScale, Bilinear, 0) #else TEST_SCALESWAPXY1(UVScale, Bilinear, 0) #endif #undef TEST_SCALESWAPXY1 TEST_F(LibYUVScaleTest, UVTest3x) { const int kSrcStride = 480 * 2; const int kDstStride = 160 * 2; const int kSize = kSrcStride * 3; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < 480 * 3; ++i) { orig_pixels[i * 2 + 0] = i; orig_pixels[i * 2 + 1] = 255 - i; } align_buffer_page_end(dest_pixels, kDstStride); int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterBilinear); } EXPECT_EQ(225, dest_pixels[0]); EXPECT_EQ(255 - 225, dest_pixels[1]); UVScale(orig_pixels, kSrcStride, 480, 3, dest_pixels, kDstStride, 160, 1, kFilterNone); EXPECT_EQ(225, dest_pixels[0]); EXPECT_EQ(255 - 225, dest_pixels[1]); free_aligned_buffer_page_end(dest_pixels); free_aligned_buffer_page_end(orig_pixels); } TEST_F(LibYUVScaleTest, UVTest4x) { const int kSrcStride = 640 * 2; const int kDstStride = 160 * 2; const int kSize = kSrcStride * 4; align_buffer_page_end(orig_pixels, kSize); for (int i = 0; i < 640 * 4; ++i) { orig_pixels[i * 2 + 0] = i; orig_pixels[i * 2 + 1] = 255 - i; } align_buffer_page_end(dest_pixels, kDstStride); int iterations160 = (benchmark_width_ * benchmark_height_ + (160 - 1)) / 160 * benchmark_iterations_; for (int i = 0; i < iterations160; ++i) { UVScale(orig_pixels, kSrcStride, 640, 4, dest_pixels, kDstStride, 160, 1, kFilterBilinear); } EXPECT_EQ(66, dest_pixels[0]); EXPECT_EQ(190, dest_pixels[1]); UVScale(orig_pixels, kSrcStride, 64, 4, dest_pixels, kDstStride, 16, 1, kFilterNone); EXPECT_EQ(2, dest_pixels[0]); // expect the 3rd pixel of the 3rd row EXPECT_EQ(255 - 2, dest_pixels[1]); free_aligned_buffer_page_end(dest_pixels); free_aligned_buffer_page_end(orig_pixels); } } // namespace libyuv 
libyuv-0.0~git20220104.b91df1a/unit_test/testdata/000077500000000000000000000000001416500237200213555ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/unit_test/testdata/arm_v7.txt000066400000000000000000000004501416500237200233100ustar00rootroot00000000000000Processor : ARMv7 Processor rev 5 (v7l) BogoMIPS : 795.44 Features : swp half thumb fastmult vfp edsp iwmmxt thumbee vfpv3 vfpv3d16 CPU implementer : 0x56 CPU architecture: 7 CPU variant : 0x0 CPU part : 0x581 CPU revision : 5 Hardware : OLPC XO-1.75 Revision : 0000 Serial : 0000000000000000 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/juno.txt000066400000000000000000000005641416500237200230760ustar00rootroot00000000000000Processor : AArch64 Processor rev 0 (aarch64) processor : 0 processor : 1 processor : 2 processor : 3 processor : 4 processor : 5 Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 CPU implementer : 0x41 CPU architecture: AArch64 CPU variant : 0x0 CPU part : 0xd07 CPU revision : 0 Hardware : Juno libyuv-0.0~git20220104.b91df1a/unit_test/testdata/mips.txt000066400000000000000000000004201416500237200230620ustar00rootroot00000000000000system type : generic-loongson-machine machine : loongson,generic processor : 0 isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2 ASEs implemented : vz shadow register sets : 1 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/mips_loongson2k.txt000066400000000000000000000002131416500237200252350ustar00rootroot00000000000000system type : Loongson2K-SBC machine : loongson,LS2k1000-EVP processor : 0 cpu model : Loongson-2K V0.3 FPU V0.1 BogoMIPS : 1980.41 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/mips_loongson3.txt000066400000000000000000000005121416500237200250650ustar00rootroot00000000000000system type : generic-loongson-machine machine : Unknown processor : 0 cpu model : ICT Loongson-3 V0.9 FPU V0.1 model name : ICT Loongson-3A R3 (Loongson-3A3000) @ 1500MHz BogoMIPS : 2990.15 isa : mips1 mips2 mips3 mips4 mips5 
mips32r1 mips32r2 mips64r1 mips64r2 ASEs implemented : dsp dsp2 vz shadow register sets : 1 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/mips_loongson_mmi.txt000066400000000000000000000004521416500237200256470ustar00rootroot00000000000000system type : generic-loongson-machine machine : loongson,generic processor : 0 isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2 ASEs implemented : vz loongson-mmi loongson-ext shadow register sets : 1 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/mips_msa.txt000066400000000000000000000004241416500237200237260ustar00rootroot00000000000000system type : generic-loongson-machine machine : loongson,generic processor : 0 isa : mips1 mips2 mips3 mips4 mips5 mips32r1 mips32r2 mips64r1 mips64r2 ASEs implemented : vz msa shadow register sets : 1 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/tegra3.txt000066400000000000000000000007071416500237200233070ustar00rootroot00000000000000Processor : ARMv7 Processor rev 9 (v7l) processor : 0 BogoMIPS : 1992.29 processor : 1 BogoMIPS : 1992.29 processor : 2 BogoMIPS : 1992.29 processor : 3 BogoMIPS : 1992.29 Features : swp half thumb fastmult vfp edsp neon vfpv3 CPU implementer : 0×41 CPU architecture: 7 CPU variant : 0×2 CPU part : 0xc09 CPU revision : 9 Hardware : cardhu Revision : 0000 libyuv-0.0~git20220104.b91df1a/unit_test/testdata/test0.jpg000066400000000000000000000006451416500237200231230ustar00rootroot00000000000000ÿØÿàJFIFHHÿÛC   (1#%(:3=<9387@H\N@DWE78PmQW_bghg>Mqypdx\egcÿ  ÿÄÿÚC~§—WÿÄÿÚ;ÀofvV#‡™ &bö¿ÿÄ!21q¡ÿÚ?K³(2Òíù>QsƒÿÄ!Q1aðÿÚ?!en1†(ù0Ü'Û©óÞ í4h#ùÆH]z5õoÿÚ5ÿÄ1Aaq‘!ѱÿÚ? 
0éX¾ýˆ«‹4t€KµÕ«ÍF–.콪xG\G§0I­ˆ|@t0ÿ# ·ÔÿÙlibyuv-0.0~git20220104.b91df1a/unit_test/testdata/test1.jpg000066400000000000000000000013371416500237200231230ustar00rootroot00000000000000ÿØÿàJFIFHHÿÛC   (1#%(:3=<9387@H\N@DWE78PmQW_bghg>Mqypdx\egcÿÛC//cB8Bccccccccccccccccccccccccccccccccccccccccccccccccccÿ ÿÄÿÄÿÚ @&èôÌùi+*ËÿÄÿÚ;€oVvV#‡™ &bö¿ÿÄÿÚ?ñ'E»1¯ÿÄAÿÚ?öK_H³ic5r¿ÿÄ!21q¡ÿÚ?K³(2Òíù>QsƒÿÄ!Q1aðÿÚ?!un1”(ù0Ü'Û©óÞ í4h#ùÆH]z5õoÿÚ &aÔÿÄ!1AQÿÚ?T¨¿P‡°‹Äj&k*œÿÄ!QÿÚ?pá>ÑŽ áµÕ‘vC‚EL{ÿÄ1Aaq‘!ѱÿÚ?0éX¾ýŠë‹4t€KµÕ«ÍF–.콪xG\G§0I­ˆ|@t0ÿ# ·ÔÿÙlibyuv-0.0~git20220104.b91df1a/unit_test/testdata/test2.jpg000066400000000000000000000012551416500237200231230ustar00rootroot00000000000000ÿØÿàJFIFHHÿÛC   (1#%(:3=<9387@H\N@DWE78PmQW_bghg>Mqypdx\egcÿÛC//cB8Bccccccccccccccccccccccccccccccccccccccccccccccccccÿ "ÿÄÿÄÿÚ  ç(£ .-ÏÿÄÿÚ;€oVvV#‡™ &bö¿ÿÄ!ÿÚ?ÈSÿÄ2ÿÚ?ÒÇÿÄ!21q¡ÿÚ?K³(2Òíù>QsƒÿÄ!Q1aðÿÚ?!un1”(ù0Ü'Û©óÞ í4h#ùÆH]z5õoÿÚ _ÿÄ!ÿÚ?¡:vÿÄ!ÿÚ?W pÛÿÄ1Aaq‘!ѱÿÚ?0éX¾ýŠë‹4t€KµÕ«ÍF–.콪xG\G§0I­ˆ|@t0ÿ# ·ÔÿÙlibyuv-0.0~git20220104.b91df1a/unit_test/testdata/test3.jpg000066400000000000000000000013001416500237200231130ustar00rootroot00000000000000ÿØÿàJFIFHHÿÛC   (1#%(:3=<9387@H\N@DWE78PmQW_bghg>Mqypdx\egcÿÛC//cB8Bccccccccccccccccccccccccccccccccccccccccccccccccccÿ !ÿÄÿÄÿÚ C¢³ÊWÿÄÿÚ;€oVvV#‡™ &bö¿ÿÄ!ÿÚ?QÎŒuÿÄa!ÿÚ?¦Ù/„èðÿÄ!21q¡ÿÚ?K³(2Òíù>QsƒÿÄ!Q1aðÿÚ?!un1”(ù0Ü'Û©óÞ í4h#ùÆH]z5õoÿÚ .EÿÄ!1ÿÚ?SPºTÁgOÿÄ!ÿÚ?\Ê‘¿ÿÄ1Aaq‘!ѱÿÚ?0éX¾ýŠë‹4t€KµÕ«ÍF–.콪xG\G§0I­ˆ|@t0ÿ# ·ÔÿÙlibyuv-0.0~git20220104.b91df1a/unit_test/testdata/test4.jpg000066400000000000000000000012751416500237200231270ustar00rootroot00000000000000ÿØÿàJFIFHHÿÛC   (1#%(:3=<9387@H\N@DWE78PmQW_bghg>Mqypdx\egcÿÛC//cB8Bccccccccccccccccccccccccccccccccccccccccccccccccccÿ ÿÄÿÄÿÚ Ò˜é F!ÙÿÄÿÚ;€oVvV#‡™ &bö¿ÿÄ!ÿÚ?˜±½GÿÄ!ÿÚ?¶5¢áGÿÄ!21q¡ÿÚ?K³(2Òíù>QsƒÿÄ!Q1aðÿÚ?!un1”(ù0Ü'Û©óÞ í4h#ùÆH]z5õoÿÚ $¯ÿÄQ!1ÿÚ?YÊB`ŸiÿÄ!1aÿÚ?°×'Q¶AÿÄ1Aaq‘!ѱÿÚ?0éX¾ýŠë‹4t€KµÕ«ÍF–.콪xG\G§0I­ˆ|@t0ÿ# 
·ÔÿÙlibyuv-0.0~git20220104.b91df1a/unit_test/unit_test.cc000066400000000000000000000431321416500237200220740ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "../unit_test/unit_test.h" #include // For getenv() #include #ifdef LIBYUV_USE_ABSL_FLAGS #include "absl/flags/flag.h" #include "absl/flags/parse.h" #endif #include "libyuv/cpu_id.h" unsigned int fastrand_seed = 0xfb; #ifdef LIBYUV_USE_ABSL_FLAGS ABSL_FLAG(int32_t, libyuv_width, 0, "width of test image."); ABSL_FLAG(int32_t, libyuv_height, 0, "height of test image."); ABSL_FLAG(int32_t, libyuv_repeat, 0, "number of times to repeat test."); ABSL_FLAG(int32_t, libyuv_flags, 0, "cpu flags for reference code. 1 = C, -1 = SIMD"); ABSL_FLAG(int32_t, libyuv_cpu_info, 0, "cpu flags for benchmark code. 1 = C, -1 = SIMD"); #else // Disable command line parameters if absl/flags disabled. static const int32_t FLAGS_libyuv_width = 0; static const int32_t FLAGS_libyuv_height = 0; static const int32_t FLAGS_libyuv_repeat = 0; static const int32_t FLAGS_libyuv_flags = 0; static const int32_t FLAGS_libyuv_cpu_info = 0; #endif #ifdef LIBYUV_USE_ABSL_FLAGS #define LIBYUV_GET_FLAG(f) absl::GetFlag(f) #else #define LIBYUV_GET_FLAG(f) f #endif // Test environment variable for disabling CPU features. Any non-zero value // to disable. Zero ignored to make it easy to set the variable on/off. #if !defined(__native_client__) && !defined(_M_ARM) static LIBYUV_BOOL TestEnv(const char* name) { const char* var = getenv(name); if (var) { if (var[0] != '0') { return LIBYUV_TRUE; } } return LIBYUV_FALSE; } #else // nacl does not support getenv(). 
static LIBYUV_BOOL TestEnv(const char*) { return LIBYUV_FALSE; } #endif int TestCpuEnv(int cpu_info) { #if defined(__arm__) || defined(__aarch64__) if (TestEnv("LIBYUV_DISABLE_NEON")) { cpu_info &= ~libyuv::kCpuHasNEON; } #endif #if defined(__mips__) && defined(__linux__) if (TestEnv("LIBYUV_DISABLE_MSA")) { cpu_info &= ~libyuv::kCpuHasMSA; } if (TestEnv("LIBYUV_DISABLE_MMI")) { cpu_info &= ~libyuv::kCpuHasMMI; } #endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ defined(_M_IX86)) if (TestEnv("LIBYUV_DISABLE_X86")) { cpu_info &= ~libyuv::kCpuHasX86; } if (TestEnv("LIBYUV_DISABLE_SSE2")) { cpu_info &= ~libyuv::kCpuHasSSE2; } if (TestEnv("LIBYUV_DISABLE_SSSE3")) { cpu_info &= ~libyuv::kCpuHasSSSE3; } if (TestEnv("LIBYUV_DISABLE_SSE41")) { cpu_info &= ~libyuv::kCpuHasSSE41; } if (TestEnv("LIBYUV_DISABLE_SSE42")) { cpu_info &= ~libyuv::kCpuHasSSE42; } if (TestEnv("LIBYUV_DISABLE_AVX")) { cpu_info &= ~libyuv::kCpuHasAVX; } if (TestEnv("LIBYUV_DISABLE_AVX2")) { cpu_info &= ~libyuv::kCpuHasAVX2; } if (TestEnv("LIBYUV_DISABLE_ERMS")) { cpu_info &= ~libyuv::kCpuHasERMS; } if (TestEnv("LIBYUV_DISABLE_FMA3")) { cpu_info &= ~libyuv::kCpuHasFMA3; } if (TestEnv("LIBYUV_DISABLE_F16C")) { cpu_info &= ~libyuv::kCpuHasF16C; } if (TestEnv("LIBYUV_DISABLE_AVX512BW")) { cpu_info &= ~libyuv::kCpuHasAVX512BW; } if (TestEnv("LIBYUV_DISABLE_AVX512VL")) { cpu_info &= ~libyuv::kCpuHasAVX512VL; } if (TestEnv("LIBYUV_DISABLE_AVX512VBMI")) { cpu_info &= ~libyuv::kCpuHasAVX512VBMI; } if (TestEnv("LIBYUV_DISABLE_AVX512VBMI2")) { cpu_info &= ~libyuv::kCpuHasAVX512VBMI2; } if (TestEnv("LIBYUV_DISABLE_AVX512VBITALG")) { cpu_info &= ~libyuv::kCpuHasAVX512VBITALG; } if (TestEnv("LIBYUV_DISABLE_AVX512VPOPCNTDQ")) { cpu_info &= ~libyuv::kCpuHasAVX512VPOPCNTDQ; } if (TestEnv("LIBYUV_DISABLE_GFNI")) { cpu_info &= ~libyuv::kCpuHasGFNI; } #endif if (TestEnv("LIBYUV_DISABLE_ASM")) { cpu_info = libyuv::kCpuInitialized; } return 
cpu_info; } // For quicker unittests, default is 128 x 72. But when benchmarking, // default to 720p. Allow size to specify. // Set flags to -1 for benchmarking to avoid slower C code. LibYUVConvertTest::LibYUVConvertTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } LibYUVColorTest::LibYUVColorTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const 
char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } LibYUVScaleTest::LibYUVScaleTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { 
benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } LibYUVRotateTest::LibYUVRotateTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } 
const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } LibYUVPlanarTest::LibYUVPlanarTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if 
(LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } LibYUVBaseTest::LibYUVBaseTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * 
static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } LibYUVCompareTest::LibYUVCompareTest() : benchmark_iterations_(1), benchmark_width_(128), benchmark_height_(72), disable_cpu_flags_(1), benchmark_cpu_info_(-1) { const char* repeat = getenv("LIBYUV_REPEAT"); if (repeat) { benchmark_iterations_ = atoi(repeat); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_repeat)) { benchmark_iterations_ = LIBYUV_GET_FLAG(FLAGS_libyuv_repeat); } if (benchmark_iterations_ > 1) { benchmark_width_ = 1280; benchmark_height_ = 720; } const char* width = getenv("LIBYUV_WIDTH"); if (width) { benchmark_width_ = atoi(width); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_width)) { benchmark_width_ = LIBYUV_GET_FLAG(FLAGS_libyuv_width); } const char* height = getenv("LIBYUV_HEIGHT"); if (height) { benchmark_height_ = atoi(height); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_height)) { benchmark_height_ = LIBYUV_GET_FLAG(FLAGS_libyuv_height); } const char* cpu_flags = getenv("LIBYUV_FLAGS"); if (cpu_flags) { disable_cpu_flags_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_flags)) { disable_cpu_flags_ = LIBYUV_GET_FLAG(FLAGS_libyuv_flags); } const char* cpu_info = getenv("LIBYUV_CPU_INFO"); if (cpu_info) { benchmark_cpu_info_ = atoi(cpu_flags); // NOLINT } if (LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info)) { benchmark_cpu_info_ = LIBYUV_GET_FLAG(FLAGS_libyuv_cpu_info); } disable_cpu_flags_ = TestCpuEnv(disable_cpu_flags_); benchmark_cpu_info_ = TestCpuEnv(benchmark_cpu_info_); libyuv::MaskCpuFlags(benchmark_cpu_info_); benchmark_pixels_div1280_ = static_cast((static_cast(Abs(benchmark_width_)) * static_cast(Abs(benchmark_height_)) * static_cast(benchmark_iterations_) + 1279.0) / 1280.0); } int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); #ifdef LIBYUV_USE_ABSL_FLAGS absl::ParseCommandLine(argc, argv); #endif return RUN_ALL_TESTS(); } 
libyuv-0.0~git20220104.b91df1a/unit_test/unit_test.h000066400000000000000000000167241416500237200217450ustar00rootroot00000000000000/* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #ifndef UNIT_TEST_UNIT_TEST_H_ // NOLINT #define UNIT_TEST_UNIT_TEST_H_ #ifdef _WIN32 #include #else #include #include #endif #include #include "libyuv/basic_types.h" #ifndef SIMD_ALIGNED #if defined(_MSC_VER) && !defined(__CLR_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var #elif defined(__GNUC__) && !defined(__pnacl__) #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #else #define SIMD_ALIGNED(var) var #endif #endif static __inline int Abs(int v) { return v >= 0 ? v : -v; } static __inline float FAbs(float v) { return v >= 0 ? v : -v; } #define OFFBY 0 // Scaling uses 16.16 fixed point to step thru the source image, so a // maximum size of 32767.999 can be expressed. 32768 is valid because // the step is 1 beyond the image but not used. // Destination size is mainly constrained by valid scale step not the // absolute size, so it may be possible to relax the destination size // constraint. // Source size is unconstrained for most specialized scalers. e.g. // An image of 65536 scaled to half size would be valid. The test // could be relaxed for special scale factors. // If this test is removed, the scaling function should gracefully // fail with a return code. The test could be changed to know that // libyuv failed in a controlled way. 
static const int kMaxWidth = 32768; static const int kMaxHeight = 32768; static inline bool SizeValid(int src_width, int src_height, int dst_width, int dst_height) { if (src_width > kMaxWidth || src_height > kMaxHeight || dst_width > kMaxWidth || dst_height > kMaxHeight) { printf("Warning - size too large to test. Skipping\n"); return false; } return true; } #define align_buffer_page_end(var, size) \ uint8_t* var##_mem = \ reinterpret_cast(malloc(((size) + 4095 + 63) & ~4095)); \ uint8_t* var = reinterpret_cast( \ (intptr_t)(var##_mem + (((size) + 4095 + 63) & ~4095) - (size)) & ~63) #define free_aligned_buffer_page_end(var) \ free(var##_mem); \ var = 0 #ifdef WIN32 static inline double get_time() { LARGE_INTEGER t, f; QueryPerformanceCounter(&t); QueryPerformanceFrequency(&f); return static_cast(t.QuadPart) / static_cast(f.QuadPart); } #else static inline double get_time() { struct timeval t; struct timezone tzp; gettimeofday(&t, &tzp); return t.tv_sec + t.tv_usec * 1e-6; } #endif #ifndef SIMD_ALIGNED #if defined(_MSC_VER) && !defined(__CLR_VER) #define SIMD_ALIGNED(var) __declspec(align(16)) var #elif defined(__GNUC__) && !defined(__pnacl__) #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #else #define SIMD_ALIGNED(var) var #endif #endif extern unsigned int fastrand_seed; inline int fastrand() { fastrand_seed = fastrand_seed * 214013u + 2531011u; return static_cast((fastrand_seed >> 16) & 0xffff); } // ubsan fails if dst is unaligned unless we use uint8 static inline void MemRandomize(uint8_t* dst, int64_t len) { int64_t i; for (i = 0; i < len - 1; i += 2) { int r = fastrand(); dst[0] = static_cast(r); dst[1] = static_cast(r >> 8); dst += 2; } for (; i < len; ++i) { *dst++ = fastrand(); } } class LibYUVColorTest : public ::testing::Test { protected: LibYUVColorTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. 
Use 360 for benchmarking VGA. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVConvertTest : public ::testing::Test { protected: LibYUVConvertTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVScaleTest : public ::testing::Test { protected: LibYUVScaleTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVRotateTest : public ::testing::Test { protected: LibYUVRotateTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVPlanarTest : public ::testing::Test { protected: LibYUVPlanarTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. 
int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVBaseTest : public ::testing::Test { protected: LibYUVBaseTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; class LibYUVCompareTest : public ::testing::Test { protected: LibYUVCompareTest(); int benchmark_iterations_; // Default 1. Use 1000 for benchmarking. int benchmark_width_; // Default 1280. Use 640 for benchmarking VGA. int benchmark_height_; // Default 720. Use 360 for benchmarking VGA. int benchmark_pixels_div1280_; // Total pixels to benchmark / 1280. int disable_cpu_flags_; // Default 1. Use -1 for benchmarking. int benchmark_cpu_info_; // Default -1. Use 1 to disable SIMD. }; #endif // UNIT_TEST_UNIT_TEST_H_ NOLINT libyuv-0.0~git20220104.b91df1a/unit_test/video_common_test.cc000066400000000000000000000132251416500237200235730ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include "../unit_test/unit_test.h" #include "libyuv/video_common.h" namespace libyuv { // Tests FourCC codes in video common, which are used for ConvertToI420(). 
static bool TestValidChar(uint32_t onecc) { return (onecc >= '0' && onecc <= '9') || (onecc >= 'A' && onecc <= 'Z') || (onecc >= 'a' && onecc <= 'z') || (onecc == ' ') || (onecc == 0xff); } static bool TestValidFourCC(uint32_t fourcc, int bpp) { if (!TestValidChar(fourcc & 0xff) || !TestValidChar((fourcc >> 8) & 0xff) || !TestValidChar((fourcc >> 16) & 0xff) || !TestValidChar((fourcc >> 24) & 0xff)) { return false; } if (bpp < 0 || bpp > 64) { return false; } return true; } TEST_F(LibYUVBaseTest, TestCanonicalFourCC) { EXPECT_EQ(static_cast(FOURCC_I420), CanonicalFourCC(FOURCC_IYUV)); EXPECT_EQ(static_cast(FOURCC_I420), CanonicalFourCC(FOURCC_YU12)); EXPECT_EQ(static_cast(FOURCC_I422), CanonicalFourCC(FOURCC_YU16)); EXPECT_EQ(static_cast(FOURCC_I444), CanonicalFourCC(FOURCC_YU24)); EXPECT_EQ(static_cast(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUYV)); EXPECT_EQ(static_cast(FOURCC_YUY2), CanonicalFourCC(FOURCC_YUVS)); EXPECT_EQ(static_cast(FOURCC_UYVY), CanonicalFourCC(FOURCC_HDYC)); EXPECT_EQ(static_cast(FOURCC_UYVY), CanonicalFourCC(FOURCC_2VUY)); EXPECT_EQ(static_cast(FOURCC_MJPG), CanonicalFourCC(FOURCC_JPEG)); EXPECT_EQ(static_cast(FOURCC_MJPG), CanonicalFourCC(FOURCC_DMB1)); EXPECT_EQ(static_cast(FOURCC_RAW), CanonicalFourCC(FOURCC_RGB3)); EXPECT_EQ(static_cast(FOURCC_24BG), CanonicalFourCC(FOURCC_BGR3)); EXPECT_EQ(static_cast(FOURCC_BGRA), CanonicalFourCC(FOURCC_CM32)); EXPECT_EQ(static_cast(FOURCC_RAW), CanonicalFourCC(FOURCC_CM24)); EXPECT_EQ(static_cast(FOURCC_RGBO), CanonicalFourCC(FOURCC_L555)); EXPECT_EQ(static_cast(FOURCC_RGBP), CanonicalFourCC(FOURCC_L565)); EXPECT_EQ(static_cast(FOURCC_RGBO), CanonicalFourCC(FOURCC_5551)); } TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420)); EXPECT_TRUE(TestValidFourCC(FOURCC_I420, FOURCC_BPP_I420)); EXPECT_TRUE(TestValidFourCC(FOURCC_I422, FOURCC_BPP_I422)); EXPECT_TRUE(TestValidFourCC(FOURCC_I444, FOURCC_BPP_I444)); EXPECT_TRUE(TestValidFourCC(FOURCC_I400, 
FOURCC_BPP_I400)); EXPECT_TRUE(TestValidFourCC(FOURCC_NV21, FOURCC_BPP_NV21)); EXPECT_TRUE(TestValidFourCC(FOURCC_NV12, FOURCC_BPP_NV12)); EXPECT_TRUE(TestValidFourCC(FOURCC_YUY2, FOURCC_BPP_YUY2)); EXPECT_TRUE(TestValidFourCC(FOURCC_UYVY, FOURCC_BPP_UYVY)); EXPECT_TRUE(TestValidFourCC(FOURCC_M420, FOURCC_BPP_M420)); // deprecated. EXPECT_TRUE(TestValidFourCC(FOURCC_Q420, FOURCC_BPP_Q420)); // deprecated. EXPECT_TRUE(TestValidFourCC(FOURCC_ARGB, FOURCC_BPP_ARGB)); EXPECT_TRUE(TestValidFourCC(FOURCC_BGRA, FOURCC_BPP_BGRA)); EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR)); EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30)); EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30)); EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64)); EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64)); EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG)); EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBP, FOURCC_BPP_RGBP)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBO, FOURCC_BPP_RGBO)); EXPECT_TRUE(TestValidFourCC(FOURCC_R444, FOURCC_BPP_R444)); EXPECT_TRUE(TestValidFourCC(FOURCC_H420, FOURCC_BPP_H420)); EXPECT_TRUE(TestValidFourCC(FOURCC_H422, FOURCC_BPP_H422)); EXPECT_TRUE(TestValidFourCC(FOURCC_H010, FOURCC_BPP_H010)); EXPECT_TRUE(TestValidFourCC(FOURCC_H210, FOURCC_BPP_H210)); EXPECT_TRUE(TestValidFourCC(FOURCC_I010, FOURCC_BPP_I010)); EXPECT_TRUE(TestValidFourCC(FOURCC_I210, FOURCC_BPP_I210)); EXPECT_TRUE(TestValidFourCC(FOURCC_P010, FOURCC_BPP_P010)); EXPECT_TRUE(TestValidFourCC(FOURCC_P210, FOURCC_BPP_P210)); EXPECT_TRUE(TestValidFourCC(FOURCC_MJPG, FOURCC_BPP_MJPG)); EXPECT_TRUE(TestValidFourCC(FOURCC_YV12, FOURCC_BPP_YV12)); EXPECT_TRUE(TestValidFourCC(FOURCC_YV16, FOURCC_BPP_YV16)); EXPECT_TRUE(TestValidFourCC(FOURCC_YV24, FOURCC_BPP_YV24)); EXPECT_TRUE(TestValidFourCC(FOURCC_YU12, FOURCC_BPP_YU12)); 
EXPECT_TRUE(TestValidFourCC(FOURCC_IYUV, FOURCC_BPP_IYUV)); EXPECT_TRUE(TestValidFourCC(FOURCC_YU16, FOURCC_BPP_YU16)); EXPECT_TRUE(TestValidFourCC(FOURCC_YU24, FOURCC_BPP_YU24)); EXPECT_TRUE(TestValidFourCC(FOURCC_YUYV, FOURCC_BPP_YUYV)); EXPECT_TRUE(TestValidFourCC(FOURCC_YUVS, FOURCC_BPP_YUVS)); EXPECT_TRUE(TestValidFourCC(FOURCC_HDYC, FOURCC_BPP_HDYC)); EXPECT_TRUE(TestValidFourCC(FOURCC_2VUY, FOURCC_BPP_2VUY)); EXPECT_TRUE(TestValidFourCC(FOURCC_JPEG, FOURCC_BPP_JPEG)); EXPECT_TRUE(TestValidFourCC(FOURCC_DMB1, FOURCC_BPP_DMB1)); EXPECT_TRUE(TestValidFourCC(FOURCC_BA81, FOURCC_BPP_BA81)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGB3, FOURCC_BPP_RGB3)); EXPECT_TRUE(TestValidFourCC(FOURCC_BGR3, FOURCC_BPP_BGR3)); EXPECT_TRUE(TestValidFourCC(FOURCC_H264, FOURCC_BPP_H264)); EXPECT_TRUE(TestValidFourCC(FOURCC_ANY, FOURCC_BPP_ANY)); } } // namespace libyuv libyuv-0.0~git20220104.b91df1a/util/000077500000000000000000000000001416500237200165035ustar00rootroot00000000000000libyuv-0.0~git20220104.b91df1a/util/Makefile000066400000000000000000000004731416500237200201470ustar00rootroot00000000000000psnr: psnr.cc ssim.cc psnr_main.cc ifeq ($(CXX),icl) $(CXX) /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc else $(CXX) -msse2 -O3 -fopenmp -static -o psnr psnr.cc ssim.cc psnr_main.cc -Wl,--strip-all endif # for MacOS # /usr/local/bin/g++-7 -msse2 -O3 -fopenmp -Bstatic -o psnr psnr.cc ssim.cc psnr_main.cc libyuv-0.0~git20220104.b91df1a/util/color.cc000066400000000000000000000077661416500237200201500ustar00rootroot00000000000000/* * Copyright 2021 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include #include #include // This utility computes values needed to generate yuvconstants based on // white point values. // The yuv formulas are tuned for 8 bit YUV channels. // For those MCs that can be represented as kr and kb: // Full range // float M[3][3] // {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}}; // float B[3] // {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255}; // Limited range // float M[3][3] // {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}}; // float B[3] // {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785}; // mc bt // 1 bt.709 KR = 0.2126; KB = 0.0722 // 4 fcc KR = 0.30; KB = 0.11 // 6 bt.601 KR = 0.299; KB = 0.114 // 7 SMPTE 240M KR = 0.212; KB = 0.087 // 10 bt2020 KR = 0.2627; KB = 0.0593 // BT.709 full range YUV to RGB reference // R = Y + V * 1.5748 // G = Y - U * 0.18732 - V * 0.46812 // B = Y + U * 1.8556 // KR = 0.2126 // KB = 0.0722 // https://mymusing.co/bt601-yuv-to-rgb-conversion-color/ // // Y contribution to R,G,B. Scale and bias. // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ // #define YB 32 /* 64 / 2 */ // // // U and V contributions to R,G,B. // #define UB 113 /* round(1.77200 * 64) */ // #define UG 22 /* round(0.34414 * 64) */ // #define VG 46 /* round(0.71414 * 64) */ // #define VR 90 /* round(1.40200 * 64) */ // // // Bias values to round, and subtract 128 from U and V. 
// #define BB (-UB * 128 + YB) // #define BG (UG * 128 + VG * 128 + YB) // #define BR (-VR * 128 + YB) int round(float v) { return (int)(v + 0.5); } int main(int argc, const char* argv[]) { if (argc < 2) { printf("color kr kb\n"); return -1; } float kr = atof(argv[1]); float kb = atof(argv[2]); float kg = 1 - kr - kb; float vr = 2 * (1 - kr); float ug = 2 * ((1 - kb) * kb / kg); float vg = 2 * ((1 - kr) * kr / kg); float ub = 2 * (1 - kb); printf("Full range\n"); printf("R = Y + V * %5f\n", vr); printf("G = Y - U * %6f - V * %6f\n", ug, vg); printf("B = Y + U * %5f\n", ub); printf("KR = %4f; ", kr); printf("KB = %4f\n", kb); // printf("KG = %4f\n", kg); // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ // #define YB 32 /* 64 / 2 */ // // // U and V contributions to R,G,B. printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub); printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug); printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg); printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr); vr = 255.f / 224.f * 2 * (1 - kr); ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg); vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg); ub = 255.f / 224.f * 2 * (1 - kb); printf("Limited range\n"); printf("R = (Y - 16) * 1.164 + V * %5f\n", vr); printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg); printf("B = (Y - 16) * 1.164 + U * %5f\n", ub); // printf("KG = %4f\n", kg); // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ // #define YB 32 /* 64 / 2 */ // // // U and V contributions to R,G,B. printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub); printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug); printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg); printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr); return 0; } libyuv-0.0~git20220104.b91df1a/util/compare.cc000066400000000000000000000036741416500237200204520ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. 
All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include #include #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/version.h" int main(int argc, char** argv) { if (argc < 1) { printf("libyuv compare v%d\n", LIBYUV_VERSION); printf("compare file1.yuv file2.yuv\n"); return -1; } char* name1 = argv[1]; char* name2 = (argc > 2) ? argv[2] : NULL; FILE* fin1 = fopen(name1, "rb"); FILE* fin2 = name2 ? fopen(name2, "rb") : NULL; const int kBlockSize = 32768; uint8_t buf1[kBlockSize]; uint8_t buf2[kBlockSize]; uint32_t hash1 = 5381; uint32_t hash2 = 5381; uint64_t sum_square_err = 0; uint64_t size_min = 0; int amt1 = 0; int amt2 = 0; do { amt1 = static_cast(fread(buf1, 1, kBlockSize, fin1)); if (amt1 > 0) { hash1 = libyuv::HashDjb2(buf1, amt1, hash1); } if (fin2) { amt2 = static_cast(fread(buf2, 1, kBlockSize, fin2)); if (amt2 > 0) { hash2 = libyuv::HashDjb2(buf2, amt2, hash2); } int amt_min = (amt1 < amt2) ? amt1 : amt2; size_min += amt_min; sum_square_err += libyuv::ComputeSumSquareError(buf1, buf2, amt_min); } } while (amt1 > 0 || amt2 > 0); printf("hash1 %x", hash1); if (fin2) { printf(", hash2 %x", hash2); double mse = static_cast(sum_square_err) / static_cast(size_min); printf(", mse %.2f", mse); double psnr = libyuv::SumSquareErrorToPsnr(sum_square_err, size_min); printf(", psnr %.2f\n", psnr); fclose(fin2); } fclose(fin1); } libyuv-0.0~git20220104.b91df1a/util/cpuid.c000066400000000000000000000076001416500237200177560ustar00rootroot00000000000000/* * Copyright 2012 The LibYuv Project Authors. All rights reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include #include "libyuv/cpu_id.h" #ifdef __cplusplus using namespace libyuv; #endif int main(int argc, const char* argv[]) { int cpu_flags = TestCpuFlag(-1); int has_arm = TestCpuFlag(kCpuHasARM); int has_mips = TestCpuFlag(kCpuHasMIPS); int has_x86 = TestCpuFlag(kCpuHasX86); (void)argc; (void)argv; #if defined(__i386__) || defined(__x86_64__) || \ defined(_M_IX86) || defined(_M_X64) if (has_x86) { int family, model, cpu_info[4]; // Vendor ID: // AuthenticAMD AMD processor // CentaurHauls Centaur processor // CyrixInstead Cyrix processor // GenuineIntel Intel processor // GenuineTMx86 Transmeta processor // Geode by NSC National Semiconductor processor // NexGenDriven NexGen processor // RiseRiseRise Rise Technology processor // SiS SiS SiS SiS processor // UMC UMC UMC UMC processor CpuId(0, 0, &cpu_info[0]); cpu_info[0] = cpu_info[1]; // Reorder output cpu_info[1] = cpu_info[3]; cpu_info[3] = 0; printf("Cpu Vendor: %s\n", (char*)(&cpu_info[0])); // CPU Family and Model // 3:0 - Stepping // 7:4 - Model // 11:8 - Family // 13:12 - Processor Type // 19:16 - Extended Model // 27:20 - Extended Family CpuId(1, 0, &cpu_info[0]); family = ((cpu_info[0] >> 8) & 0x0f) | ((cpu_info[0] >> 16) & 0xff0); model = ((cpu_info[0] >> 4) & 0x0f) | ((cpu_info[0] >> 12) & 0xf0); printf("Cpu Family %d (0x%x), Model %d (0x%x)\n", family, family, model, model); } #endif printf("Cpu Flags %x\n", cpu_flags); printf("Has ARM %x\n", has_arm); printf("Has MIPS %x\n", has_mips); printf("Has X86 %x\n", has_x86); if (has_arm) { int has_neon = TestCpuFlag(kCpuHasNEON); printf("Has NEON %x\n", has_neon); } if (has_mips) { int has_msa = TestCpuFlag(kCpuHasMSA); 
printf("Has MSA %x\n", has_msa); int has_mmi = TestCpuFlag(kCpuHasMMI); printf("Has MMI %x\n", has_mmi); } if (has_x86) { int has_sse2 = TestCpuFlag(kCpuHasSSE2); int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); int has_sse41 = TestCpuFlag(kCpuHasSSE41); int has_sse42 = TestCpuFlag(kCpuHasSSE42); int has_avx = TestCpuFlag(kCpuHasAVX); int has_avx2 = TestCpuFlag(kCpuHasAVX2); int has_erms = TestCpuFlag(kCpuHasERMS); int has_fma3 = TestCpuFlag(kCpuHasFMA3); int has_f16c = TestCpuFlag(kCpuHasF16C); int has_gfni = TestCpuFlag(kCpuHasGFNI); int has_avx512bw = TestCpuFlag(kCpuHasAVX512BW); int has_avx512vl = TestCpuFlag(kCpuHasAVX512VL); int has_avx512vbmi = TestCpuFlag(kCpuHasAVX512VBMI); int has_avx512vbmi2 = TestCpuFlag(kCpuHasAVX512VBMI2); int has_avx512vbitalg = TestCpuFlag(kCpuHasAVX512VBITALG); int has_avx512vpopcntdq = TestCpuFlag(kCpuHasAVX512VPOPCNTDQ); printf("Has SSE2 %x\n", has_sse2); printf("Has SSSE3 %x\n", has_ssse3); printf("Has SSE4.1 %x\n", has_sse41); printf("Has SSE4.2 %x\n", has_sse42); printf("Has AVX %x\n", has_avx); printf("Has AVX2 %x\n", has_avx2); printf("Has ERMS %x\n", has_erms); printf("Has FMA3 %x\n", has_fma3); printf("Has F16C %x\n", has_f16c); printf("Has GFNI %x\n", has_gfni); printf("Has AVX512BW %x\n", has_avx512bw); printf("Has AVX512VL %x\n", has_avx512vl); printf("Has AVX512VBMI %x\n", has_avx512vbmi); printf("Has AVX512VBMI2 %x\n", has_avx512vbmi2); printf("Has AVX512VBITALG %x\n", has_avx512vbitalg); printf("Has AVX512VPOPCNTDQ %x\n", has_avx512vpopcntdq); } return 0; } libyuv-0.0~git20220104.b91df1a/util/i444tonv12_eg.cc000066400000000000000000000016371416500237200212320ustar00rootroot00000000000000 #include "libyuv/convert.h" #include // for printf #include // for memset int main(int, char**) { unsigned char src_i444[640 * 400 * 3]; unsigned char dst_nv12[640 * 400 * 3 / 2]; for (size_t i = 0; i < sizeof(src_i444); ++i) { src_i444[i] = i & 255; } memset(dst_nv12, 0, sizeof(dst_nv12)); libyuv::I444ToNV12(&src_i444[0], 640, // source 
Y &src_i444[640 * 400], 640, // source U &src_i444[640 * 400 * 2], 640, // source V &dst_nv12[0], 640, // dest Y &dst_nv12[640 * 400], 640, // dest UV 640, 400); // width and height int checksum = 0; for (size_t i = 0; i < sizeof(dst_nv12); ++i) { checksum += dst_nv12[i]; } printf("checksum %x %s\n", checksum, checksum == 0x2ec0c00 ? "PASS" : "FAIL"); return 0; }libyuv-0.0~git20220104.b91df1a/util/psnr.cc000066400000000000000000000237331416500237200200040ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include "./psnr.h" // NOLINT #ifdef _OPENMP #include #endif #ifdef _MSC_VER #include // For __cpuid() #endif #ifdef __cplusplus extern "C" { #endif typedef unsigned int uint32_t; // NOLINT #ifdef _MSC_VER typedef unsigned __int64 uint64_t; #else // COMPILER_MSVC #if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) typedef unsigned long uint64_t; // NOLINT #else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) typedef unsigned long long uint64_t; // NOLINT #endif // __LP64__ #endif // _MSC_VER // libyuv provides this function when linking library for jpeg support. #if !defined(HAVE_JPEG) #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) #define HAS_SUMSQUAREERROR_NEON static uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { volatile uint32_t sse; asm volatile( "vmov.u8 q7, #0 \n" "vmov.u8 q9, #0 \n" "vmov.u8 q8, #0 \n" "vmov.u8 q10, #0 \n" "1: \n" "vld1.u8 {q0}, [%0]! \n" "vld1.u8 {q1}, [%1]! 
\n" "vsubl.u8 q2, d0, d2 \n" "vsubl.u8 q3, d1, d3 \n" "vmlal.s16 q7, d4, d4 \n" "vmlal.s16 q8, d6, d6 \n" "vmlal.s16 q8, d5, d5 \n" "vmlal.s16 q10, d7, d7 \n" "subs %2, %2, #16 \n" "bhi 1b \n" "vadd.u32 q7, q7, q8 \n" "vadd.u32 q9, q9, q10 \n" "vadd.u32 q10, q7, q9 \n" "vpaddl.u32 q1, q10 \n" "vadd.u64 d0, d2, d3 \n" "vmov.32 %3, d0[0] \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"); return sse; } #elif !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #define HAS_SUMSQUAREERROR_NEON static uint32_t SumSquareError_NEON(const uint8_t* src_a, const uint8_t* src_b, int count) { volatile uint32_t sse; asm volatile( "eor v16.16b, v16.16b, v16.16b \n" "eor v18.16b, v18.16b, v18.16b \n" "eor v17.16b, v17.16b, v17.16b \n" "eor v19.16b, v19.16b, v19.16b \n" "1: \n" "ld1 {v0.16b}, [%0], #16 \n" "ld1 {v1.16b}, [%1], #16 \n" "subs %w2, %w2, #16 \n" "usubl v2.8h, v0.8b, v1.8b \n" "usubl2 v3.8h, v0.16b, v1.16b \n" "smlal v16.4s, v2.4h, v2.4h \n" "smlal v17.4s, v3.4h, v3.4h \n" "smlal2 v18.4s, v2.8h, v2.8h \n" "smlal2 v19.4s, v3.8h, v3.8h \n" "b.gt 1b \n" "add v16.4s, v16.4s, v17.4s \n" "add v18.4s, v18.4s, v19.4s \n" "add v19.4s, v16.4s, v18.4s \n" "addv s0, v19.4s \n" "fmov %w3, s0 \n" : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) : : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } #elif !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #define HAS_SUMSQUAREERROR_SSE2 __declspec(naked) static uint32_t SumSquareError_SSE2(const uint8_t* /*src_a*/, const uint8_t* /*src_b*/, int /*count*/) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 sub edx, eax wloop: movdqu xmm1, [eax] movdqu xmm2, [eax + edx] lea eax, [eax + 16] movdqu xmm3, xmm1 psubusb xmm1, xmm2 psubusb xmm2, xmm3 por xmm1, xmm2 movdqu xmm2, xmm1 punpcklbw xmm1, xmm5 punpckhbw xmm2, xmm5 pmaddwd xmm1, xmm1 pmaddwd 
xmm2, xmm2 paddd xmm0, xmm1 paddd xmm0, xmm2 sub ecx, 16 ja wloop pshufd xmm1, xmm0, 0EEh paddd xmm0, xmm1 pshufd xmm1, xmm0, 01h paddd xmm0, xmm1 movd eax, xmm0 ret } } #elif !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) #define HAS_SUMSQUAREERROR_SSE2 static uint32_t SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse; asm volatile( // NOLINT "pxor %%xmm0,%%xmm0 \n" "pxor %%xmm5,%%xmm5 \n" "sub %0,%1 \n" "1: \n" "movdqu (%0),%%xmm1 \n" "movdqu (%0,%1,1),%%xmm2 \n" "lea 0x10(%0),%0 \n" "movdqu %%xmm1,%%xmm3 \n" "psubusb %%xmm2,%%xmm1 \n" "psubusb %%xmm3,%%xmm2 \n" "por %%xmm2,%%xmm1 \n" "movdqu %%xmm1,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm1 \n" "punpckhbw %%xmm5,%%xmm2 \n" "pmaddwd %%xmm1,%%xmm1 \n" "pmaddwd %%xmm2,%%xmm2 \n" "paddd %%xmm1,%%xmm0 \n" "paddd %%xmm2,%%xmm0 \n" "sub $0x10,%2 \n" "ja 1b \n" "pshufd $0xee,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "pshufd $0x1,%%xmm0,%%xmm1 \n" "paddd %%xmm1,%%xmm0 \n" "movd %%xmm0,%3 \n" : "+r"(src_a), // %0 "+r"(src_b), // %1 "+r"(count), // %2 "=g"(sse) // %3 : : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif ); // NOLINT return sse; } #endif // LIBYUV_DISABLE_X86 etc #if defined(HAS_SUMSQUAREERROR_SSE2) #if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__) static __inline void __cpuid(int cpu_info[4], int info_type) { asm volatile( // NOLINT "mov %%ebx, %%edi \n" "cpuid \n" "xchg %%edi, %%ebx \n" : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type)); } // For gcc/clang but not clangcl. 
#elif !defined(_MSC_VER) && (defined(__i386__) || defined(__x86_64__)) static __inline void __cpuid(int cpu_info[4], int info_type) { asm volatile( // NOLINT "cpuid \n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) : "a"(info_type)); } #endif static int CpuHasSSE2() { #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) int cpu_info[4]; __cpuid(cpu_info, 1); if (cpu_info[3] & 0x04000000) { return 1; } #endif return 0; } #endif // HAS_SUMSQUAREERROR_SSE2 static uint32_t SumSquareError_C(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t sse = 0u; for (int x = 0; x < count; ++x) { int diff = src_a[x] - src_b[x]; sse += static_cast(diff * diff); } return sse; } double ComputeSumSquareError(const uint8_t* src_a, const uint8_t* src_b, int count) { uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) SumSquareError = SumSquareError_NEON; #endif #if defined(HAS_SUMSQUAREERROR_SSE2) if (CpuHasSSE2()) { SumSquareError = SumSquareError_SSE2; } #endif const int kBlockSize = 1 << 15; uint64_t sse = 0; #ifdef _OPENMP #pragma omp parallel for reduction(+ : sse) #endif for (int i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { sse += SumSquareError(src_a + i, src_b + i, kBlockSize); } src_a += count & ~(kBlockSize - 1); src_b += count & ~(kBlockSize - 1); int remainder = count & (kBlockSize - 1) & ~15; if (remainder) { sse += SumSquareError(src_a, src_b, remainder); src_a += remainder; src_b += remainder; } remainder = count & 15; if (remainder) { sse += SumSquareError_C(src_a, src_b, remainder); } return static_cast(sse); } #endif // PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) // Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). 
double ComputePSNR(double sse, double size) { const double kMINSSE = 255.0 * 255.0 * size / pow(10.0, kMaxPSNR / 10.0); if (sse <= kMINSSE) { sse = kMINSSE; // Produces max PSNR of 128 } return 10.0 * log10(255.0 * 255.0 * size / sse); } #ifdef __cplusplus } // extern "C" #endif libyuv-0.0~git20220104.b91df1a/util/psnr.h000066400000000000000000000026101416500237200176350ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ // Get PSNR for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format #ifndef UTIL_PSNR_H_ // NOLINT #define UTIL_PSNR_H_ #include // For log10() #ifdef __cplusplus extern "C" { #endif #if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED) typedef unsigned char uint8_t; #define UINT8_TYPE_DEFINED #endif static const double kMaxPSNR = 128.0; // libyuv provides this function when linking library for jpeg support. // TODO(fbarchard): make psnr lib compatible subset of libyuv. #if !defined(HAVE_JPEG) // Computer Sum of Squared Error (SSE). // Pass this to ComputePSNR for final result. double ComputeSumSquareError(const uint8_t* src_a, const uint8_t* src_b, int count); #endif // PSNR formula: psnr = 10 * log10 (Peak Signal^2 * size / sse) // Returns 128.0 (kMaxPSNR) if sse is 0 (perfect match). double ComputePSNR(double sse, double size); #ifdef __cplusplus } // extern "C" #endif #endif // UTIL_PSNR_H_ // NOLINT libyuv-0.0~git20220104.b91df1a/util/psnr_main.cc000066400000000000000000000517161416500237200210120ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. 
* * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ // Get PSNR or SSIM for video sequence. Assuming RAW 4:2:0 Y:Cb:Cr format // To build: g++ -O3 -o psnr psnr.cc ssim.cc psnr_main.cc // or VisualC: cl /Ox psnr.cc ssim.cc psnr_main.cc // // To enable OpenMP and SSE2 // gcc: g++ -msse2 -O3 -fopenmp -o psnr psnr.cc ssim.cc psnr_main.cc // vc: cl /arch:SSE2 /Ox /openmp psnr.cc ssim.cc psnr_main.cc // // Usage: psnr org_seq rec_seq -s width height [-skip skip_org skip_rec] #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include #ifdef _OPENMP #include #endif #include "./psnr.h" #include "./ssim.h" #ifdef HAVE_JPEG #include "libyuv/compare.h" #include "libyuv/convert.h" #endif struct metric { double y, u, v, all; double min_y, min_u, min_v, min_all; double global_y, global_u, global_v, global_all; int min_frame; }; // options bool verbose = false; bool quiet = false; bool show_name = false; bool do_swap_uv = false; bool do_psnr = false; bool do_ssim = false; bool do_mse = false; bool do_lssim = false; int image_width = 0, image_height = 0; int fileindex_org = 0; // argv argument contains the source file name. int fileindex_rec = 0; // argv argument contains the destination file name. int num_rec = 0; int num_skip_org = 0; int num_skip_rec = 0; int num_frames = 0; #ifdef _OPENMP int num_threads = 0; #endif // Parse PYUV format. ie name.1920x800_24Hz_P420.yuv bool ExtractResolutionFromFilename(const char* name, int* width_ptr, int* height_ptr) { // Isolate the .width_height. section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { if ((name[i] == '.' 
|| name[i] == '_') && name[i + 1] >= '0' && name[i + 1] <= '9') { int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT if (2 == n) { return true; } } } #ifdef HAVE_JPEG // Try parsing file as a jpeg. FILE* const file_org = fopen(name, "rb"); if (file_org == NULL) { fprintf(stderr, "Cannot open %s\n", name); return false; } fseek(file_org, 0, SEEK_END); size_t total_size = ftell(file_org); fseek(file_org, 0, SEEK_SET); uint8_t* const ch_org = new uint8_t[total_size]; memset(ch_org, 0, total_size); size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org); fclose(file_org); if (bytes_org == total_size) { if (0 == libyuv::MJPGSize(ch_org, total_size, width_ptr, height_ptr)) { delete[] ch_org; return true; } } delete[] ch_org; #endif // HAVE_JPEG return false; } // Scale Y channel from 16..240 to 0..255. // This can be useful when comparing codecs that are inconsistant about Y uint8_t ScaleY(uint8_t y) { int ny = (y - 16) * 256 / 224; if (ny < 0) { ny = 0; } if (ny > 255) { ny = 255; } return static_cast(ny); } // MSE = Mean Square Error double GetMSE(double sse, double size) { return sse / size; } void PrintHelp(const char* program) { printf("%s [-options] org_seq rec_seq [rec_seq2.. etc]\n", program); #ifdef HAVE_JPEG printf("jpeg or raw YUV 420 supported.\n"); #endif printf("options:\n"); printf( " -s .... specify YUV size, mandatory if none of the " "sequences have the\n"); printf( " resolution embedded in their filename (ie. " "name.1920x800_24Hz_P420.yuv)\n"); printf(" -psnr .................. compute PSNR (default)\n"); printf(" -ssim .................. compute SSIM\n"); printf(" -mse ................... compute MSE\n"); printf(" -swap .................. Swap U and V plane\n"); printf(" -skip ...... Number of frame to skip of org and rec\n"); printf(" -frames .......... Number of frames to compare\n"); #ifdef _OPENMP printf(" -t ............... Number of threads\n"); #endif printf(" -n ..................... 
Show file name\n"); printf(" -v ..................... verbose++\n"); printf(" -q ..................... quiet\n"); printf(" -h ..................... this help\n"); exit(0); } void ParseOptions(int argc, const char* argv[]) { if (argc <= 1) { PrintHelp(argv[0]); } for (int c = 1; c < argc; ++c) { if (!strcmp(argv[c], "-v")) { verbose = true; } else if (!strcmp(argv[c], "-q")) { quiet = true; } else if (!strcmp(argv[c], "-n")) { show_name = true; } else if (!strcmp(argv[c], "-psnr")) { do_psnr = true; } else if (!strcmp(argv[c], "-mse")) { do_mse = true; } else if (!strcmp(argv[c], "-ssim")) { do_ssim = true; } else if (!strcmp(argv[c], "-lssim")) { do_ssim = true; do_lssim = true; } else if (!strcmp(argv[c], "-swap")) { do_swap_uv = true; } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { PrintHelp(argv[0]); } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { image_width = atoi(argv[++c]); // NOLINT image_height = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-skip") && c + 2 < argc) { num_skip_org = atoi(argv[++c]); // NOLINT num_skip_rec = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { num_frames = atoi(argv[++c]); // NOLINT #ifdef _OPENMP } else if (!strcmp(argv[c], "-t") && c + 1 < argc) { num_threads = atoi(argv[++c]); // NOLINT #endif } else if (argv[c][0] == '-') { fprintf(stderr, "Unknown option. 
%s\n", argv[c]); } else if (fileindex_org == 0) { fileindex_org = c; } else if (fileindex_rec == 0) { fileindex_rec = c; num_rec = 1; } else { ++num_rec; } } if (fileindex_org == 0 || fileindex_rec == 0) { fprintf(stderr, "Missing filenames\n"); PrintHelp(argv[0]); } if (num_skip_org < 0 || num_skip_rec < 0) { fprintf(stderr, "Skipped frames incorrect\n"); PrintHelp(argv[0]); } if (num_frames < 0) { fprintf(stderr, "Number of frames incorrect\n"); PrintHelp(argv[0]); } if (image_width == 0 || image_height == 0) { int org_width, org_height; int rec_width, rec_height; bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], &org_width, &org_height); bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], &rec_width, &rec_height); if (org_res_avail) { if (rec_res_avail) { if ((org_width == rec_width) && (org_height == rec_height)) { image_width = org_width; image_height = org_height; } else { fprintf(stderr, "Sequences have different resolutions.\n"); PrintHelp(argv[0]); } } else { image_width = org_width; image_height = org_height; } } else if (rec_res_avail) { image_width = rec_width; image_height = rec_height; } else { fprintf(stderr, "Missing dimensions.\n"); PrintHelp(argv[0]); } } } bool UpdateMetrics(uint8_t* ch_org, uint8_t* ch_rec, const int y_size, const int uv_size, const size_t total_size, int number_of_frames, metric* cur_distortion_psnr, metric* distorted_frame, bool compute_psnr) { const int uv_offset = (do_swap_uv ? 
uv_size : 0); const uint8_t* const u_org = ch_org + y_size + uv_offset; const uint8_t* const u_rec = ch_rec + y_size; const uint8_t* const v_org = ch_org + y_size + (uv_size - uv_offset); const uint8_t* const v_rec = ch_rec + y_size + uv_size; if (compute_psnr) { #ifdef HAVE_JPEG double y_err = static_cast( libyuv::ComputeSumSquareError(ch_org, ch_rec, y_size)); double u_err = static_cast( libyuv::ComputeSumSquareError(u_org, u_rec, uv_size)); double v_err = static_cast( libyuv::ComputeSumSquareError(v_org, v_rec, uv_size)); #else double y_err = ComputeSumSquareError(ch_org, ch_rec, y_size); double u_err = ComputeSumSquareError(u_org, u_rec, uv_size); double v_err = ComputeSumSquareError(v_org, v_rec, uv_size); #endif const double total_err = y_err + u_err + v_err; cur_distortion_psnr->global_y += y_err; cur_distortion_psnr->global_u += u_err; cur_distortion_psnr->global_v += v_err; cur_distortion_psnr->global_all += total_err; distorted_frame->y = ComputePSNR(y_err, static_cast(y_size)); distorted_frame->u = ComputePSNR(u_err, static_cast(uv_size)); distorted_frame->v = ComputePSNR(v_err, static_cast(uv_size)); distorted_frame->all = ComputePSNR(total_err, static_cast(total_size)); } else { distorted_frame->y = CalcSSIM(ch_org, ch_rec, image_width, image_height); distorted_frame->u = CalcSSIM(u_org, u_rec, (image_width + 1) / 2, (image_height + 1) / 2); distorted_frame->v = CalcSSIM(v_org, v_rec, (image_width + 1) / 2, (image_height + 1) / 2); distorted_frame->all = (distorted_frame->y + distorted_frame->u + distorted_frame->v) / total_size; distorted_frame->y /= y_size; distorted_frame->u /= uv_size; distorted_frame->v /= uv_size; if (do_lssim) { distorted_frame->all = CalcLSSIM(distorted_frame->all); distorted_frame->y = CalcLSSIM(distorted_frame->y); distorted_frame->u = CalcLSSIM(distorted_frame->u); distorted_frame->v = CalcLSSIM(distorted_frame->v); } } cur_distortion_psnr->y += distorted_frame->y; cur_distortion_psnr->u += distorted_frame->u; 
cur_distortion_psnr->v += distorted_frame->v; cur_distortion_psnr->all += distorted_frame->all; bool ismin = false; if (distorted_frame->y < cur_distortion_psnr->min_y) { cur_distortion_psnr->min_y = distorted_frame->y; } if (distorted_frame->u < cur_distortion_psnr->min_u) { cur_distortion_psnr->min_u = distorted_frame->u; } if (distorted_frame->v < cur_distortion_psnr->min_v) { cur_distortion_psnr->min_v = distorted_frame->v; } if (distorted_frame->all < cur_distortion_psnr->min_all) { cur_distortion_psnr->min_all = distorted_frame->all; cur_distortion_psnr->min_frame = number_of_frames; ismin = true; } return ismin; } int main(int argc, const char* argv[]) { ParseOptions(argc, argv); if (!do_psnr && !do_ssim) { do_psnr = true; } #ifdef _OPENMP if (num_threads) { omp_set_num_threads(num_threads); } if (verbose) { printf("OpenMP %d procs\n", omp_get_num_procs()); } #endif // Open original file (first file argument) FILE* const file_org = fopen(argv[fileindex_org], "rb"); if (file_org == NULL) { fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]); exit(1); } // Open all files to compare to FILE** file_rec = new FILE*[num_rec]; memset(file_rec, 0, num_rec * sizeof(FILE*)); // NOLINT for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "rb"); if (file_rec[cur_rec] == NULL) { fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]); fclose(file_org); for (int i = 0; i < cur_rec; ++i) { fclose(file_rec[i]); } delete[] file_rec; exit(1); } } const int y_size = image_width * image_height; const int uv_size = ((image_width + 1) / 2) * ((image_height + 1) / 2); const size_t total_size = y_size + 2 * uv_size; // NOLINT #if defined(_MSC_VER) _fseeki64( file_org, static_cast<__int64>(num_skip_org) * static_cast<__int64>(total_size), SEEK_SET); #else fseek(file_org, num_skip_org * total_size, SEEK_SET); #endif for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { #if defined(_MSC_VER) _fseeki64( 
file_rec[cur_rec], static_cast<__int64>(num_skip_rec) * static_cast<__int64>(total_size), SEEK_SET); #else fseek(file_rec[cur_rec], num_skip_rec * total_size, SEEK_SET); #endif } uint8_t* const ch_org = new uint8_t[total_size]; uint8_t* const ch_rec = new uint8_t[total_size]; if (ch_org == NULL || ch_rec == NULL) { fprintf(stderr, "No memory available\n"); fclose(file_org); for (int i = 0; i < num_rec; ++i) { fclose(file_rec[i]); } delete[] ch_org; delete[] ch_rec; delete[] file_rec; exit(1); } metric* const distortion_psnr = new metric[num_rec]; metric* const distortion_ssim = new metric[num_rec]; for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; cur_distortion_psnr->y = 0.0; cur_distortion_psnr->u = 0.0; cur_distortion_psnr->v = 0.0; cur_distortion_psnr->all = 0.0; cur_distortion_psnr->min_y = kMaxPSNR; cur_distortion_psnr->min_u = kMaxPSNR; cur_distortion_psnr->min_v = kMaxPSNR; cur_distortion_psnr->min_all = kMaxPSNR; cur_distortion_psnr->min_frame = 0; cur_distortion_psnr->global_y = 0.0; cur_distortion_psnr->global_u = 0.0; cur_distortion_psnr->global_v = 0.0; cur_distortion_psnr->global_all = 0.0; distortion_ssim[cur_rec] = cur_distortion_psnr[cur_rec]; } if (verbose) { printf("Size: %dx%d\n", image_width, image_height); } if (!quiet) { printf("Frame"); if (do_psnr) { printf("\t PSNR-Y \t PSNR-U \t PSNR-V \t PSNR-All \t Frame"); } if (do_ssim) { printf("\t SSIM-Y\t SSIM-U\t SSIM-V\t SSIM-All\t Frame"); } if (show_name) { printf("\tName\n"); } else { printf("\n"); } } int number_of_frames; for (number_of_frames = 0;; ++number_of_frames) { if (num_frames && number_of_frames >= num_frames) { break; } size_t bytes_org = fread(ch_org, sizeof(uint8_t), total_size, file_org); if (bytes_org < total_size) { #ifdef HAVE_JPEG // Try parsing file as a jpeg. 
uint8_t* const ch_jpeg = new uint8_t[bytes_org]; memcpy(ch_jpeg, ch_org, bytes_org); memset(ch_org, 0, total_size); if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_org, ch_org, image_width, ch_org + y_size, (image_width + 1) / 2, ch_org + y_size + uv_size, (image_width + 1) / 2, image_width, image_height, image_width, image_height)) { delete[] ch_jpeg; break; } delete[] ch_jpeg; #else break; #endif // HAVE_JPEG } for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { size_t bytes_rec = fread(ch_rec, sizeof(uint8_t), total_size, file_rec[cur_rec]); if (bytes_rec < total_size) { #ifdef HAVE_JPEG // Try parsing file as a jpeg. uint8_t* const ch_jpeg = new uint8_t[bytes_rec]; memcpy(ch_jpeg, ch_rec, bytes_rec); memset(ch_rec, 0, total_size); if (0 != libyuv::MJPGToI420(ch_jpeg, bytes_rec, ch_rec, image_width, ch_rec + y_size, (image_width + 1) / 2, ch_rec + y_size + uv_size, (image_width + 1) / 2, image_width, image_height, image_width, image_height)) { delete[] ch_jpeg; break; } delete[] ch_jpeg; #else break; #endif // HAVE_JPEG } if (verbose) { printf("%5d", number_of_frames); } if (do_psnr) { metric distorted_frame = {}; metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, number_of_frames, cur_distortion_psnr, &distorted_frame, true); if (verbose) { printf("\t%10.6f", distorted_frame.y); printf("\t%10.6f", distorted_frame.u); printf("\t%10.6f", distorted_frame.v); printf("\t%10.6f", distorted_frame.all); printf("\t%5s", ismin ? "min" : ""); } } if (do_ssim) { metric distorted_frame = {}; metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; bool ismin = UpdateMetrics(ch_org, ch_rec, y_size, uv_size, total_size, number_of_frames, cur_distortion_ssim, &distorted_frame, false); if (verbose) { printf("\t%10.6f", distorted_frame.y); printf("\t%10.6f", distorted_frame.u); printf("\t%10.6f", distorted_frame.v); printf("\t%10.6f", distorted_frame.all); printf("\t%5s", ismin ? 
"min" : ""); } } if (verbose) { if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } printf("\n"); } } } // Final PSNR computation. for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { metric* cur_distortion_psnr = &distortion_psnr[cur_rec]; metric* cur_distortion_ssim = &distortion_ssim[cur_rec]; if (number_of_frames > 0) { const double norm = 1. / static_cast(number_of_frames); cur_distortion_psnr->y *= norm; cur_distortion_psnr->u *= norm; cur_distortion_psnr->v *= norm; cur_distortion_psnr->all *= norm; cur_distortion_ssim->y *= norm; cur_distortion_ssim->u *= norm; cur_distortion_ssim->v *= norm; cur_distortion_ssim->all *= norm; } if (do_psnr) { const double global_psnr_y = ComputePSNR(cur_distortion_psnr->global_y, static_cast(y_size) * number_of_frames); const double global_psnr_u = ComputePSNR(cur_distortion_psnr->global_u, static_cast(uv_size) * number_of_frames); const double global_psnr_v = ComputePSNR(cur_distortion_psnr->global_v, static_cast(uv_size) * number_of_frames); const double global_psnr_all = ComputePSNR(cur_distortion_psnr->global_all, static_cast(total_size) * number_of_frames); printf("Global:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_psnr_y, global_psnr_u, global_psnr_v, global_psnr_all, number_of_frames); if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } printf("\n"); } if (!quiet) { printf("Avg:"); if (do_psnr) { printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->y, cur_distortion_psnr->u, cur_distortion_psnr->v, cur_distortion_psnr->all, number_of_frames); } if (do_ssim) { printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->y, cur_distortion_ssim->u, cur_distortion_ssim->v, cur_distortion_ssim->all, number_of_frames); } if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } printf("\n"); } if (!quiet) { printf("Min:"); if (do_psnr) { printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_psnr->min_y, cur_distortion_psnr->min_u, 
cur_distortion_psnr->min_v, cur_distortion_psnr->min_all, cur_distortion_psnr->min_frame); } if (do_ssim) { printf("\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", cur_distortion_ssim->min_y, cur_distortion_ssim->min_u, cur_distortion_ssim->min_v, cur_distortion_ssim->min_all, cur_distortion_ssim->min_frame); } if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } printf("\n"); } if (do_mse) { double global_mse_y = GetMSE(cur_distortion_psnr->global_y, static_cast(y_size) * number_of_frames); double global_mse_u = GetMSE(cur_distortion_psnr->global_u, static_cast(uv_size) * number_of_frames); double global_mse_v = GetMSE(cur_distortion_psnr->global_v, static_cast(uv_size) * number_of_frames); double global_mse_all = GetMSE(cur_distortion_psnr->global_all, static_cast(total_size) * number_of_frames); printf("MSE:\t%10.6f\t%10.6f\t%10.6f\t%10.6f\t%5d", global_mse_y, global_mse_u, global_mse_v, global_mse_all, number_of_frames); if (show_name) { printf("\t%s", argv[fileindex_rec + cur_rec]); } printf("\n"); } } fclose(file_org); for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) { fclose(file_rec[cur_rec]); } delete[] distortion_psnr; delete[] distortion_ssim; delete[] ch_org; delete[] ch_rec; delete[] file_rec; return 0; } libyuv-0.0~git20220104.b91df1a/util/ssim.cc000066400000000000000000000314071416500237200177720ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ #include "../util/ssim.h" // NOLINT #include #ifdef __cplusplus extern "C" { #endif typedef unsigned int uint32_t; // NOLINT typedef unsigned short uint16_t; // NOLINT #if !defined(LIBYUV_DISABLE_X86) && !defined(__SSE2__) && \ (defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 2))) #define __SSE2__ #endif #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) #include #endif #ifdef _OPENMP #include #endif // SSIM enum { KERNEL = 3, KERNEL_SIZE = 2 * KERNEL + 1 }; // Symmetric Gaussian kernel: K[i] = ~11 * exp(-0.3 * i * i) // The maximum value (11 x 11) must be less than 128 to avoid sign // problems during the calls to _mm_mullo_epi16(). static const int K[KERNEL_SIZE] = { 1, 3, 7, 11, 7, 3, 1 // ~11 * exp(-0.3 * i * i) }; static const double kiW[KERNEL + 1 + 1] = { 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 1. / 1089., // 1 / sum(i:0..6, j..6) K[i]*K[j] 1. / 1056., // 1 / sum(i:0..5, j..6) K[i]*K[j] 1. / 957., // 1 / sum(i:0..4, j..6) K[i]*K[j] 1. / 726., // 1 / sum(i:0..3, j..6) K[i]*K[j] }; #if !defined(LIBYUV_DISABLE_X86) && defined(__SSE2__) #define PWEIGHT(A, B) static_cast(K[(A)] * K[(B)]) // weight product #define MAKE_WEIGHT(L) \ { \ { \ { \ PWEIGHT(L, 0) \ , PWEIGHT(L, 1), PWEIGHT(L, 2), PWEIGHT(L, 3), PWEIGHT(L, 4), \ PWEIGHT(L, 5), PWEIGHT(L, 6), 0 \ } \ } \ } // We need this union trick to be able to initialize constant static __m128i // values. We can't call _mm_set_epi16() for static compile-time initialization. static const struct { union { uint16_t i16_[8]; __m128i m_; } values_; } W0 = MAKE_WEIGHT(0), W1 = MAKE_WEIGHT(1), W2 = MAKE_WEIGHT(2), W3 = MAKE_WEIGHT(3); // ... the rest is symmetric. #undef MAKE_WEIGHT #undef PWEIGHT #endif // Common final expression for SSIM, once the weighted sums are known. 
static double FinalizeSSIM(double iw, double xm, double ym, double xxm, double xym, double yym) { const double iwx = xm * iw; const double iwy = ym * iw; double sxx = xxm * iw - iwx * iwx; double syy = yym * iw - iwy * iwy; // small errors are possible, due to rounding. Clamp to zero. if (sxx < 0.) { sxx = 0.; } if (syy < 0.) { syy = 0.; } const double sxsy = sqrt(sxx * syy); const double sxy = xym * iw - iwx * iwy; static const double C11 = (0.01 * 0.01) * (255 * 255); static const double C22 = (0.03 * 0.03) * (255 * 255); static const double C33 = (0.015 * 0.015) * (255 * 255); const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); const double c = (2. * sxsy + C22) / (sxx + syy + C22); const double s = (sxy + C33) / (sxsy + C33); return l * c * s; } // GetSSIM() does clipping. GetSSIMFullKernel() does not // TODO(skal): use summed tables? // Note: worst case of accumulation is a weight of 33 = 11 + 2 * (7 + 3 + 1) // with a diff of 255, squared. The maximum error is thus 0x4388241, // which fits into 32 bits integers. double GetSSIM(const uint8_t* org, const uint8_t* rec, int xo, int yo, int W, int H, int stride) { uint32_t ws = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; org += (yo - KERNEL) * stride; org += (xo - KERNEL); rec += (yo - KERNEL) * stride; rec += (xo - KERNEL); for (int y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride, rec += stride) { if (((yo - KERNEL + y_) < 0) || ((yo - KERNEL + y_) >= H)) { continue; } const int Wy = K[y_]; for (int x_ = 0; x_ < KERNEL_SIZE; ++x_) { const int Wxy = Wy * K[x_]; if (((xo - KERNEL + x_) >= 0) && ((xo - KERNEL + x_) < W)) { const int org_x = org[x_]; const int rec_x = rec[x_]; ws += Wxy; xm += Wxy * org_x; ym += Wxy * rec_x; xxm += Wxy * org_x * org_x; xym += Wxy * org_x * rec_x; yym += Wxy * rec_x * rec_x; } } } return FinalizeSSIM(1. 
/ ws, xm, ym, xxm, xym, yym); } double GetSSIMFullKernel(const uint8_t* org, const uint8_t* rec, int xo, int yo, int stride, double area_weight) { uint32_t xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; #if defined(LIBYUV_DISABLE_X86) || !defined(__SSE2__) org += yo * stride + xo; rec += yo * stride + xo; for (int y = 1; y <= KERNEL; y++) { const int dy1 = y * stride; const int dy2 = y * stride; const int Wy = K[KERNEL + y]; for (int x = 1; x <= KERNEL; x++) { // Compute the contributions of upper-left (ul), upper-right (ur) // lower-left (ll) and lower-right (lr) points (see the diagram below). // Symmetric Kernel will have same weight on those points. // - - - - - - - // - ul - - - ur - // - - - - - - - // - - - 0 - - - // - - - - - - - // - ll - - - lr - // - - - - - - - const int Wxy = Wy * K[KERNEL + x]; const int ul1 = org[-dy1 - x]; const int ur1 = org[-dy1 + x]; const int ll1 = org[dy1 - x]; const int lr1 = org[dy1 + x]; const int ul2 = rec[-dy2 - x]; const int ur2 = rec[-dy2 + x]; const int ll2 = rec[dy2 - x]; const int lr2 = rec[dy2 + x]; xm += Wxy * (ul1 + ur1 + ll1 + lr1); ym += Wxy * (ul2 + ur2 + ll2 + lr2); xxm += Wxy * (ul1 * ul1 + ur1 * ur1 + ll1 * ll1 + lr1 * lr1); xym += Wxy * (ul1 * ul2 + ur1 * ur2 + ll1 * ll2 + lr1 * lr2); yym += Wxy * (ul2 * ul2 + ur2 * ur2 + ll2 * ll2 + lr2 * lr2); } // Compute the contributions of up (u), down (d), left (l) and right (r) // points across the main axes (see the diagram below). // Symmetric Kernel will have same weight on those points. 
// - - - - - - - // - - - u - - - // - - - - - - - // - l - 0 - r - // - - - - - - - // - - - d - - - // - - - - - - - const int Wxy = Wy * K[KERNEL]; const int u1 = org[-dy1]; const int d1 = org[dy1]; const int l1 = org[-y]; const int r1 = org[y]; const int u2 = rec[-dy2]; const int d2 = rec[dy2]; const int l2 = rec[-y]; const int r2 = rec[y]; xm += Wxy * (u1 + d1 + l1 + r1); ym += Wxy * (u2 + d2 + l2 + r2); xxm += Wxy * (u1 * u1 + d1 * d1 + l1 * l1 + r1 * r1); xym += Wxy * (u1 * u2 + d1 * d2 + l1 * l2 + r1 * r2); yym += Wxy * (u2 * u2 + d2 * d2 + l2 * l2 + r2 * r2); } // Lastly the contribution of (x0, y0) point. const int Wxy = K[KERNEL] * K[KERNEL]; const int s1 = org[0]; const int s2 = rec[0]; xm += Wxy * s1; ym += Wxy * s2; xxm += Wxy * s1 * s1; xym += Wxy * s1 * s2; yym += Wxy * s2 * s2; #else // __SSE2__ org += (yo - KERNEL) * stride + (xo - KERNEL); rec += (yo - KERNEL) * stride + (xo - KERNEL); const __m128i zero = _mm_setzero_si128(); __m128i x = zero; __m128i y = zero; __m128i xx = zero; __m128i xy = zero; __m128i yy = zero; // Read 8 pixels at line #L, and convert to 16bit, perform weighting // and acccumulate. 
#define LOAD_LINE_PAIR(L, WEIGHT) \ do { \ const __m128i v0 = \ _mm_loadl_epi64(reinterpret_cast(org + (L)*stride)); \ const __m128i v1 = \ _mm_loadl_epi64(reinterpret_cast(rec + (L)*stride)); \ const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ } while (0) #define ADD_AND_STORE_FOUR_EPI32(M, OUT) \ do { \ uint32_t tmp[4]; \ _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), (M)); \ (OUT) = tmp[3] + tmp[2] + tmp[1] + tmp[0]; \ } while (0) LOAD_LINE_PAIR(0, W0); LOAD_LINE_PAIR(1, W1); LOAD_LINE_PAIR(2, W2); LOAD_LINE_PAIR(3, W3); LOAD_LINE_PAIR(4, W2); LOAD_LINE_PAIR(5, W1); LOAD_LINE_PAIR(6, W0); ADD_AND_STORE_FOUR_EPI32(x, xm); ADD_AND_STORE_FOUR_EPI32(y, ym); ADD_AND_STORE_FOUR_EPI32(xx, xxm); ADD_AND_STORE_FOUR_EPI32(xy, xym); ADD_AND_STORE_FOUR_EPI32(yy, yym); #undef LOAD_LINE_PAIR #undef ADD_AND_STORE_FOUR_EPI32 #endif return FinalizeSSIM(area_weight, xm, ym, xxm, xym, yym); } static int start_max(int x, int y) { return (x > y) ? x : y; } double CalcSSIM(const uint8_t* org, const uint8_t* rec, const int image_width, const int image_height) { double SSIM = 0.; const int KERNEL_Y = (image_height < KERNEL) ? image_height : KERNEL; const int KERNEL_X = (image_width < KERNEL) ? 
image_width : KERNEL; const int start_x = start_max(image_width - 8 + KERNEL_X, KERNEL_X); const int start_y = start_max(image_height - KERNEL_Y, KERNEL_Y); const int stride = image_width; for (int j = 0; j < KERNEL_Y; ++j) { for (int i = 0; i < image_width; ++i) { SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); } } #ifdef _OPENMP #pragma omp parallel for reduction(+ : SSIM) #endif for (int j = KERNEL_Y; j < image_height - KERNEL_Y; ++j) { for (int i = 0; i < KERNEL_X; ++i) { SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); } for (int i = KERNEL_X; i < start_x; ++i) { SSIM += GetSSIMFullKernel(org, rec, i, j, stride, kiW[0]); } if (start_x < image_width) { // GetSSIMFullKernel() needs to be able to read 8 pixels (in SSE2). So we // copy the 8 rightmost pixels on a cache area, and pad this area with // zeros which won't contribute to the overall SSIM value (but we need // to pass the correct normalizing constant!). By using this cache, we can // still call GetSSIMFullKernel() instead of the slower GetSSIM(). // NOTE: we could use similar method for the left-most pixels too. 
const int kScratchWidth = 8; const int kScratchStride = kScratchWidth + KERNEL + 1; uint8_t scratch_org[KERNEL_SIZE * kScratchStride] = {0}; uint8_t scratch_rec[KERNEL_SIZE * kScratchStride] = {0}; for (int k = 0; k < KERNEL_SIZE; ++k) { const int offset = (j - KERNEL + k) * stride + image_width - kScratchWidth; memcpy(scratch_org + k * kScratchStride, org + offset, kScratchWidth); memcpy(scratch_rec + k * kScratchStride, rec + offset, kScratchWidth); } for (int k = 0; k <= KERNEL_X + 1; ++k) { SSIM += GetSSIMFullKernel(scratch_org, scratch_rec, KERNEL + k, KERNEL, kScratchStride, kiW[k]); } } } for (int j = start_y; j < image_height; ++j) { for (int i = 0; i < image_width; ++i) { SSIM += GetSSIM(org, rec, i, j, image_width, image_height, stride); } } return SSIM; } double CalcLSSIM(double ssim) { return -10.0 * log10(1.0 - ssim); } #ifdef __cplusplus } // extern "C" #endif libyuv-0.0~git20220104.b91df1a/util/ssim.h000066400000000000000000000017211416500237200176300ustar00rootroot00000000000000/* * Copyright 2013 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ // Get SSIM for video sequence. 
Assuming RAW 4:2:0 Y:Cb:Cr format #ifndef UTIL_SSIM_H_ #define UTIL_SSIM_H_ #include // For log10() #ifdef __cplusplus extern "C" { #endif #if !defined(INT_TYPES_DEFINED) && !defined(UINT8_TYPE_DEFINED) typedef unsigned char uint8_t; #define UINT8_TYPE_DEFINED #endif double CalcSSIM(const uint8_t* org, const uint8_t* rec, const int image_width, const int image_height); double CalcLSSIM(double ssim); #ifdef __cplusplus } // extern "C" #endif #endif // UTIL_SSIM_H_ libyuv-0.0~git20220104.b91df1a/util/yuvconstants.c000066400000000000000000000072111416500237200214300ustar00rootroot00000000000000/* * Copyright 2021 The LibYuv Project Authors. All rights reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ #include #include #include #include // This utility computes values needed to generate yuvconstants based on // white point values. // The yuv formulas are tuned for 8 bit YUV channels. // See Also // https://mymusing.co/bt601-yuv-to-rgb-conversion-color/ // BT.709 full range YUV to RGB reference // R = Y + V * 1.5748 // G = Y - U * 0.18732 - V * 0.46812 // B = Y + U * 1.8556 // KR = 0.2126 // KB = 0.0722 // // Y contribution to R,G,B. Scale and bias. // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ // #define YB 32 /* 64 / 2 */ // // // U and V contributions to R,G,B. // #define UB 113 /* round(1.77200 * 64) */ // #define UG 22 /* round(0.34414 * 64) */ // #define VG 46 /* round(0.71414 * 64) */ // #define VR 90 /* round(1.40200 * 64) */ // // // Bias values to round, and subtract 128 from U and V. 
// Prints the four chroma contributions scaled by 64 and rounded, each with
// the unrounded coefficient and scaled value alongside.
static void PrintScaledCoefficients(float ub, float ug, float vg, float vr) {
  printf("UB %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ub * 64), ub,
         ub * 64);
  printf("UG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(ug * 64), ug,
         ug * 64);
  printf("VG %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vg * 64), vg,
         vg * 64);
  printf("VR %-3.0f /* round(%f * 64 = %8.4f) */\n", round(vr * 64), vr,
         vr * 64);
}

// Usage: yuvconstants Kr Kb
// Derives the YUV to RGB coefficients from the Kr and Kb values given on the
// command line and prints them for full range and for limited range.
// Returns -1 after printing usage when either value is missing, 0 otherwise.
int main(int argc, const char* argv[]) {
  // Both Kr (argv[1]) and Kb (argv[2]) are read below, so two arguments are
  // required.
  if (argc < 3) {
    printf("yuvconstants Kr Kb\n");
    printf(" MC BT KR = 0.2126; KB = 0.0722\n");
    printf(" 1 BT.709 KR = 0.2126; KB = 0.0722\n");
    printf(" 4 FCC KR = 0.30; KB = 0.11\n");
    printf(" 6 BT.601 KR = 0.299; KB = 0.114\n");
    printf(" 7 SMPTE 240M KR = 0.212; KB = 0.087\n");
    printf(" 9 BT.2020 KR = 0.2627; KB = 0.0593\n");
    return -1;
  }
  float kr = atof(argv[1]);
  float kb = atof(argv[2]);
  float kg = 1 - kr - kb;

  float vr = 2 * (1 - kr);
  float ug = 2 * ((1 - kb) * kb / kg);
  float vg = 2 * ((1 - kr) * kr / kg);
  float ub = 2 * (1 - kb);

  printf("Full range\n");
  printf("R = Y + V * %5f\n", vr);
  printf("G = Y - U * %6f - V * %6f\n", ug, vg);
  printf("B = Y + U * %5f\n", ub);

  printf("KR = %4f; ", kr);
  printf("KB = %4f\n", kb);
  // printf("KG = %4f\n", kg);
  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
  // #define YB 32    /* 64 / 2 */
  //
  // // U and V contributions to R,G,B.
  PrintScaledCoefficients(ub, ug, vg, vr);

  // Limited range: the same coefficients scaled by 255 / 224.
  vr = 255.f / 224.f * 2 * (1 - kr);
  ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
  vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
  ub = 255.f / 224.f * 2 * (1 - kb);

  printf("\nLimited range\n");
  printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
  printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
  printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);

  // printf("KG = %4f\n", kg);
  // #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
  // #define YB 32    /* 64 / 2 */
  //
  // // U and V contributions to R,G,B.
  PrintScaledCoefficients(ub, ug, vg, vr);

  return 0;
}
section of the filename by searching for a // dot or underscore followed by a digit. for (int i = 0; name[i]; ++i) { if ((name[i] == '.' || name[i] == '_') && name[i + 1] >= '0' && name[i + 1] <= '9') { int n = sscanf(name + i + 1, "%dx%d", width_ptr, height_ptr); // NOLINT if (2 == n) { return true; } } } return false; } void PrintHelp(const char* program) { printf("%s [-options] src_argb.raw dst_yuv.raw\n", program); printf( " -s .... specify source resolution. " "Optional if name contains\n" " resolution (ie. " "name.1920x800_24Hz_P420.yuv)\n" " Negative value mirrors.\n"); printf(" -d .... specify destination resolution.\n"); printf(" -f ............ 0 = point, 1 = bilinear (default).\n"); printf(" -skip ....... Number of frame to skip of src_argb\n"); printf(" -frames .......... Number of frames to convert\n"); printf(" -attenuate ............. Attenuate the ARGB image\n"); printf(" -unattenuate ........... Unattenuate the ARGB image\n"); printf(" -v ..................... verbose\n"); printf(" -h ..................... 
this help\n"); exit(0); } void ParseOptions(int argc, const char* argv[]) { if (argc <= 1) { PrintHelp(argv[0]); } for (int c = 1; c < argc; ++c) { if (!strcmp(argv[c], "-v")) { verbose = true; } else if (!strcmp(argv[c], "-attenuate")) { attenuate = true; } else if (!strcmp(argv[c], "-unattenuate")) { unattenuate = true; } else if (!strcmp(argv[c], "-h") || !strcmp(argv[c], "-help")) { PrintHelp(argv[0]); } else if (!strcmp(argv[c], "-s") && c + 2 < argc) { image_width = atoi(argv[++c]); // NOLINT image_height = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-d") && c + 2 < argc) { dst_width = atoi(argv[++c]); // NOLINT dst_height = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-skip") && c + 1 < argc) { num_skip_org = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-frames") && c + 1 < argc) { num_frames = atoi(argv[++c]); // NOLINT } else if (!strcmp(argv[c], "-f") && c + 1 < argc) { filter = atoi(argv[++c]); // NOLINT } else if (argv[c][0] == '-') { fprintf(stderr, "Unknown option. 
%s\n", argv[c]); } else if (fileindex_org == 0) { fileindex_org = c; } else if (fileindex_rec == 0) { fileindex_rec = c; num_rec = 1; } else { ++num_rec; } } if (fileindex_org == 0 || fileindex_rec == 0) { fprintf(stderr, "Missing filenames\n"); PrintHelp(argv[0]); } if (num_skip_org < 0) { fprintf(stderr, "Skipped frames incorrect\n"); PrintHelp(argv[0]); } if (num_frames < 0) { fprintf(stderr, "Number of frames incorrect\n"); PrintHelp(argv[0]); } int org_width, org_height; int rec_width, rec_height; bool org_res_avail = ExtractResolutionFromFilename(argv[fileindex_org], &org_width, &org_height); bool rec_res_avail = ExtractResolutionFromFilename(argv[fileindex_rec], &rec_width, &rec_height); if (image_width == 0 || image_height == 0) { if (org_res_avail) { image_width = org_width; image_height = org_height; } else if (rec_res_avail) { image_width = rec_width; image_height = rec_height; } else { fprintf(stderr, "Missing dimensions.\n"); PrintHelp(argv[0]); } } if (dst_width == 0 || dst_height == 0) { if (rec_res_avail) { dst_width = rec_width; dst_height = rec_height; } else { dst_width = Abs(image_width); dst_height = Abs(image_height); } } } static const int kTileX = 32; static const int kTileY = 32; static int TileARGBScale(const uint8_t* src_argb, int src_stride_argb, int src_width, int src_height, uint8_t* dst_argb, int dst_stride_argb, int destination_width, int destination_height, libyuv::FilterMode filtering) { for (int y = 0; y < destination_height; y += kTileY) { for (int x = 0; x < destination_width; x += kTileX) { int clip_width = kTileX; if (x + clip_width > destination_width) { clip_width = destination_width - x; } int clip_height = kTileY; if (y + clip_height > destination_height) { clip_height = destination_height - y; } int r = libyuv::ARGBScaleClip(src_argb, src_stride_argb, src_width, src_height, dst_argb, dst_stride_argb, destination_width, destination_height, x, y, clip_width, clip_height, filtering); if (r) { return r; } } } return 0; } int 
main(int argc, const char* argv[]) {
  // Scales every frame of the original file (argv[fileindex_org]) and writes
  // the result to each of the num_rec output files starting at
  // argv[fileindex_rec]. The pixel format of each file is taken from the
  // "_P420." / "_ARGB." marker in its filename.
  // Returns 0 on completion. Exits with status 1 when a file cannot be
  // opened, the original format is unknown, or a frame buffer pointer is
  // NULL.
  ParseOptions(argc, argv);

  // Open original file (first file argument)
  FILE* const file_org = fopen(argv[fileindex_org], "rb");
  if (file_org == NULL) {
    fprintf(stderr, "Cannot open %s\n", argv[fileindex_org]);
    exit(1);
  }

  // Open all files to convert to
  FILE** file_rec = new FILE*[num_rec];
  memset(file_rec, 0, num_rec * sizeof(FILE*));  // NOLINT
  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
    file_rec[cur_rec] = fopen(argv[fileindex_rec + cur_rec], "wb");
    if (file_rec[cur_rec] == NULL) {
      fprintf(stderr, "Cannot open %s\n", argv[fileindex_rec + cur_rec]);
      fclose(file_org);
      // Only the outputs before cur_rec were opened successfully.
      for (int i = 0; i < cur_rec; ++i) {
        fclose(file_rec[i]);
      }
      delete[] file_rec;
      exit(1);
    }
  }

  bool org_is_yuv = strstr(argv[fileindex_org], "_P420.") != NULL;
  bool org_is_argb = strstr(argv[fileindex_org], "_ARGB.") != NULL;
  if (!org_is_yuv && !org_is_argb) {
    fprintf(stderr, "Original format unknown %s\n", argv[fileindex_org]);
    // Release everything opened so far, as the other error paths do.
    fclose(file_org);
    for (int i = 0; i < num_rec; ++i) {
      fclose(file_rec[i]);
    }
    delete[] file_rec;
    exit(1);
  }

  // Size in bytes of one frame of the original file.
  int org_size = Abs(image_width) * Abs(image_height) * 4;  // ARGB
  // Input is YUV
  if (org_is_yuv) {
    const int y_size = Abs(image_width) * Abs(image_height);
    const int uv_size =
        ((Abs(image_width) + 1) / 2) * ((Abs(image_height) + 1) / 2);
    org_size = y_size + 2 * uv_size;  // YUV original.
  }

  // Sizes in bytes of one scaled frame, as ARGB and as I420.
  const int dst_size = dst_width * dst_height * 4;  // ARGB scaled
  const int y_size = dst_width * dst_height;
  const int uv_size = ((dst_width + 1) / 2) * ((dst_height + 1) / 2);
  const size_t total_size = y_size + 2 * uv_size;

  // Skip the first num_skip_org frames of the original. The offset is counted
  // in frames of the ORIGINAL file, so both branches scale by org_size; using
  // the scaled-frame size would land in the middle of a frame whenever the
  // source and destination sizes differ.
#if defined(_MSC_VER)
  _fseeki64(file_org,
            static_cast<__int64>(num_skip_org) * static_cast<__int64>(org_size),
            SEEK_SET);
#else
  fseek(file_org,
        static_cast<long>(num_skip_org) * static_cast<long>(org_size),  // NOLINT
        SEEK_SET);
#endif

  uint8_t* const ch_org = new uint8_t[org_size];
  uint8_t* const ch_dst = new uint8_t[dst_size];
  uint8_t* const ch_rec = new uint8_t[total_size];
  if (ch_org == NULL || ch_dst == NULL || ch_rec == NULL) {
    fprintf(stderr, "No memory available\n");
    fclose(file_org);
    for (int i = 0; i < num_rec; ++i) {
      fclose(file_rec[i]);
    }
    delete[] ch_org;
    delete[] ch_dst;
    delete[] ch_rec;
    delete[] file_rec;
    exit(1);
  }

  if (verbose) {
    printf("Size: %dx%d to %dx%d\n", image_width, image_height, dst_width,
           dst_height);
  }

  // num_frames == 0 means "no limit": keep going until a short read.
  int number_of_frames;
  for (number_of_frames = 0;; ++number_of_frames) {
    if (num_frames && number_of_frames >= num_frames) {
      break;
    }

    // Load original YUV or ARGB frame.
    size_t bytes_org =
        fread(ch_org, sizeof(uint8_t), static_cast<size_t>(org_size), file_org);
    if (bytes_org < static_cast<size_t>(org_size)) {
      break;
    }

    // TODO(fbarchard): Attenuate doesnt need to know dimensions.
    // ARGB attenuate frame (in place, treated as one row of org_size / 4
    // pixels).
    if (org_is_argb && attenuate) {
      libyuv::ARGBAttenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
    }
    // ARGB unattenuate frame
    if (org_is_argb && unattenuate) {
      libyuv::ARGBUnattenuate(ch_org, 0, ch_org, 0, org_size / 4, 1);
    }

    for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
      // Scale YUV or ARGB frame.
      if (org_is_yuv) {
        // I420 source is scaled plane by plane straight into ch_rec.
        int src_width = Abs(image_width);
        int src_height = Abs(image_height);
        int half_src_width = (src_width + 1) / 2;
        int half_src_height = (src_height + 1) / 2;
        int half_dst_width = (dst_width + 1) / 2;
        int half_dst_height = (dst_height + 1) / 2;
        I420Scale(
            ch_org, src_width, ch_org + src_width * src_height, half_src_width,
            ch_org + src_width * src_height + half_src_width * half_src_height,
            half_src_width, image_width, image_height, ch_rec, dst_width,
            ch_rec + dst_width * dst_height, half_dst_width,
            ch_rec + dst_width * dst_height + half_dst_width * half_dst_height,
            half_dst_width, dst_width, dst_height,
            static_cast<libyuv::FilterMode>(filter));
      } else {
        // ARGB source is scaled into ch_dst.
        TileARGBScale(ch_org, Abs(image_width) * 4, image_width, image_height,
                      ch_dst, dst_width * 4, dst_width, dst_height,
                      static_cast<libyuv::FilterMode>(filter));
      }
      bool rec_is_yuv = strstr(argv[fileindex_rec + cur_rec], "_P420.") != NULL;
      bool rec_is_argb =
          strstr(argv[fileindex_rec + cur_rec], "_ARGB.") != NULL;
      if (!rec_is_yuv && !rec_is_argb) {
        fprintf(stderr, "Output format unknown %s\n",
                argv[fileindex_rec + cur_rec]);
        continue;  // Advance to next file.
      }

      // Convert ARGB to YUV.
      if (!org_is_yuv && rec_is_yuv) {
        int half_width = (dst_width + 1) / 2;
        int half_height = (dst_height + 1) / 2;
        libyuv::ARGBToI420(
            ch_dst, dst_width * 4, ch_rec, dst_width,
            ch_rec + dst_width * dst_height, half_width,
            ch_rec + dst_width * dst_height + half_width * half_height,
            half_width, dst_width, dst_height);
      }

      // Output YUV or ARGB frame.
      // NOTE(review): for a YUV original with an ARGB output, nothing above
      // writes ch_dst before it is written to the file - confirm whether that
      // combination is meant to be supported.
      // A short write leaves only this per-output loop; the frame loop then
      // carries on with the next frame.
      if (rec_is_yuv) {
        size_t bytes_rec =
            fwrite(ch_rec, sizeof(uint8_t), total_size, file_rec[cur_rec]);
        if (bytes_rec < total_size) {
          break;
        }
      } else {
        size_t bytes_rec = fwrite(ch_dst, sizeof(uint8_t),
                                  static_cast<size_t>(dst_size),
                                  file_rec[cur_rec]);
        if (bytes_rec < static_cast<size_t>(dst_size)) {
          break;
        }
      }
      if (verbose) {
        printf("%5d", number_of_frames);
        printf("\t%s", argv[fileindex_rec + cur_rec]);
        printf("\n");
      }
    }
  }

  fclose(file_org);
  for (int cur_rec = 0; cur_rec < num_rec; ++cur_rec) {
    fclose(file_rec[cur_rec]);
  }
  delete[] ch_org;
  delete[] ch_dst;
  delete[] ch_rec;
  delete[] file_rec;
  return 0;
}
libyuv-0.0~git20220104.b91df1a/winarm.mk000066400000000000000000000021311416500237200173510ustar00rootroot00000000000000# This is a generic makefile for libyuv for Windows Arm. # call "c:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_arm\vcvarsx86_arm.bat" # nmake /f winarm.mk # make -f winarm.mk # nmake /f winarm.mk clean # consider /arch:ARMv7VE CC=cl CCFLAGS=/Ox /nologo /Iinclude /DWINAPI_FAMILY=WINAPI_FAMILY_PHONE_APP AR=lib ARFLAGS=/MACHINE:ARM /NOLOGO /SUBSYSTEM:NATIVE RM=cmd /c del LOCAL_OBJ_FILES = \ source/compare.o\ source/compare_common.o\ source/convert.o\ source/convert_argb.o\ source/convert_from.o\ source/convert_from_argb.o\ source/convert_to_argb.o\ source/convert_to_i420.o\ source/cpu_id.o\ source/planar_functions.o\ source/rotate.o\ source/rotate_any.o\ source/rotate_argb.o\ source/rotate_common.o\ source/row_any.o\ source/row_common.o\ source/scale.o\ source/scale_any.o\ source/scale_argb.o\ source/scale_common.o\ source/scale_uv.o\ source/video_common.o .cc.o: $(CC) /c $(CCFLAGS) $*.cc /Fo$@ all: libyuv_arm.lib winarm.mk libyuv_arm.lib: $(LOCAL_OBJ_FILES) winarm.mk $(AR) $(ARFLAGS) /OUT:$@ $(LOCAL_OBJ_FILES) clean: $(RM) "source\*.o" libyuv_arm.lib