pax_global_header00006660000000000000000000000064151743511710014517gustar00rootroot0000000000000052 comment=80c23624ce008d937da7e845e528e82ce0cbf4e0 xtensor-stack-xsimd-541558d/000077500000000000000000000000001517435117100157275ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/.clang-format000066400000000000000000000002301517435117100202750ustar00rootroot00000000000000--- BasedOnStyle: WebKit AlignAfterOpenBracket: Align AlignConsecutiveDeclarations: 'false' BreakBeforeBraces: Allman NamespaceIndentation: All ... xtensor-stack-xsimd-541558d/.clang-tidy000066400000000000000000000001371517435117100177640ustar00rootroot00000000000000--- Checks: '-*,modernize-type-traits' WarningsAsErrors: true HeaderFilterRegex: '.*' xtensor-stack-xsimd-541558d/.github/000077500000000000000000000000001517435117100172675ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/.github/cmake-test/000077500000000000000000000000001517435117100213245ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/.github/cmake-test/CMakeLists.txt000066400000000000000000000002471517435117100240670ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.19) project(test VERSION 0.0.1) find_package(xsimd REQUIRED) add_executable(test main.cpp) target_link_libraries(test PUBLIC xsimd) xtensor-stack-xsimd-541558d/.github/cmake-test/main.cpp000066400000000000000000000000711517435117100227520ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" int main() { return 0; } xtensor-stack-xsimd-541558d/.github/toolchains/000077500000000000000000000000001517435117100214325ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/.github/toolchains/clang-aarch64-linux-gnu.cmake000066400000000000000000000001621517435117100266710ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR aarch64) set(triple aarch64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) 
xtensor-stack-xsimd-541558d/.github/toolchains/clang-arm-linux-gnueabihf.cmake000066400000000000000000000001641517435117100273610ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR armv7-a) set(triple arm-linux-gnueabihf) include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/clang-riscv64-linux-gnu.cmake000066400000000000000000000001621517435117100267410ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR riscv64) set(triple riscv64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/clang.cmake000066400000000000000000000005341517435117100235220ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_C_COMPILER clang) set(CMAKE_C_COMPILER_TARGET ${triple}) set(CMAKE_CXX_COMPILER clang++) set(CMAKE_CXX_COMPILER_TARGET ${triple}) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) xtensor-stack-xsimd-541558d/.github/toolchains/gcc-aarch64-linux-gnu.cmake000066400000000000000000000001601517435117100263370ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR aarch64) set(triple aarch64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/gcc-arm-linux-gnueabihf.cmake000066400000000000000000000001621517435117100270270ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR armv7-a) set(triple arm-linux-gnueabihf) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/gcc-powerpc64-linux-gnu.cmake000066400000000000000000000001651517435117100267450ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR powerpc64) set(triple powerpc64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) 
xtensor-stack-xsimd-541558d/.github/toolchains/gcc-powerpc64le-linux-gnu.cmake000066400000000000000000000001711517435117100272630ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR powerpc64le) set(triple powerpc64le-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/gcc-riscv64-linux-gnu.cmake000066400000000000000000000001601517435117100264070ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR riscv64) set(triple riscv64-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/gcc-s390x-linux-gnu.cmake000066400000000000000000000001541517435117100260000ustar00rootroot00000000000000set(CMAKE_SYSTEM_PROCESSOR s390x) set(triple s390x-linux-gnu) include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) xtensor-stack-xsimd-541558d/.github/toolchains/gcc.cmake000066400000000000000000000004321517435117100231670ustar00rootroot00000000000000set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_C_COMPILER ${triple}-gcc) set(CMAKE_CXX_COMPILER ${triple}-g++) set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) xtensor-stack-xsimd-541558d/.github/workflows/000077500000000000000000000000001517435117100213245ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/.github/workflows/android.yml000066400000000000000000000020511517435117100234650ustar00rootroot00000000000000name: Android build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: test: runs-on: ubuntu-latest strategy: matrix: target: - armeabi-v7a - arm64-v8a - x86 - x86_64 api: - 16 - 18 steps: - name: Checkout uses: actions/checkout@v6 - name: Build script env: TARGET: ${{ matrix.target }} API: ${{ matrix.api }} run: | NDK="$($ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --list_installed | sed -E 's/( +[|] +)/|/g;s/ +$//' | grep '^ 
ndk' | cut -d '|' -f 4 | sort | head -n1)" cmake -B _build \ -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ -DANDROID_ABI=$ABI -DANDROID_PLATFORM=android-$API \ -DCMAKE_BUILD_TYPE=Release cmake --build _build --verbose xtensor-stack-xsimd-541558d/.github/workflows/arch-consistency-check.yml000066400000000000000000000006631517435117100264030ustar00rootroot00000000000000name: Arch consistency check on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest steps: - name: Checkout xsimd uses: actions/checkout@v6 - name: Install dependencies run: sudo apt install g++ - name: Check architecture consistency run: cd test && sh ./check_arch.sh xtensor-stack-xsimd-541558d/.github/workflows/benchmark.yml000066400000000000000000000007541517435117100240070ustar00rootroot00000000000000name: benchmark & examples on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Setup run: cmake -B _build -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release - name: Build run: cmake --build _build - name: Testing sequential run: cmake --build _build --target xbenchmark xtensor-stack-xsimd-541558d/.github/workflows/cmake.yml000066400000000000000000000014131517435117100231260ustar00rootroot00000000000000name: CMake integration on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true defaults: run: shell: bash -l {0} jobs: build: runs-on: ubuntu-latest steps: - name: Checkout xsimd uses: actions/checkout@v6 - name: Configure build run: cmake -B _build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=_install - name: Build run: cmake --build _build --target install - name: Check 
install run: | mkdir _install_build && cd _install_build cp ${{ github.workspace }}/.github/cmake-test/* . ls $PWD/../_install/share/cmake/xsimd cmake . -DCMAKE_PREFIX_PATH=$PWD/../_install/share/cmake/xsimd cmake --build . xtensor-stack-xsimd-541558d/.github/workflows/cross-arm.yml000066400000000000000000000052521517435117100237610ustar00rootroot00000000000000name: Arm cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' strategy: matrix: target: - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'} - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } sys: - { compiler: 'gcc', version: '10' } - { compiler: 'gcc', version: '14' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1 sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user - name: Setup Ninja run: | sudo apt-get install ninja-build - name: 
Checkout xsimd uses: actions/checkout@v6 - name: Setup run: | cmake -B _build \ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} \ -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release \ -DTARGET_ARCH=generic \ -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" \ -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" \ -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build run: cmake --build _build - name: Testing xsimd run: qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build xtensor-stack-xsimd-541558d/.github/workflows/cross-ppc.yml000066400000000000000000000054021517435117100237610ustar00rootroot00000000000000name: PowerPC cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' strategy: matrix: target: - { platform: 'ppc64le', dir: 'powerpc64le-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10', full: 'OFF' } - { platform: 'ppc64', dir: 'powerpc64-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10', full: 'OFF' } sys: - { compiler: 'gcc', version: '12' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1 sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc 
/usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user - name: Setup Ninja run: | sudo apt-get install ninja-build - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup run: | cmake -B build/ \ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build run: cmake --build build/ --verbose -j1 - name: Set CPU feature test expectations run: | - name: Testing xsimd run: | # Set CPU feature test expectations, 0 is explicit absence of the feature export XSIMD_TEST_CPU_ASSUME_SSE4_2="0" export XSIMD_TEST_CPU_ASSUME_NEON64="0" export XSIMD_TEST_CPU_ASSUME_RVV="0" export XSIMD_TEST_CPU_ASSUME_VSX="1" qemu-${{ matrix.target.platform }} -cpu power10 -L /usr/${{ matrix.target.dir}}/ ./build/test/test_xsimd xtensor-stack-xsimd-541558d/.github/workflows/cross-rvv.yml000066400000000000000000000055621517435117100240230ustar00rootroot00000000000000name: RISC-V RVV cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest name: 'RISC-V RVV${{ matrix.vector_bits }}' strategy: matrix: sys: - { compiler: 'gcc', gcc_runtime: '14'} - { compiler: 'clang', version: '17', gcc_runtime: '14'} - { compiler: 'clang', version: '18', gcc_runtime: '14'} vector_bits: - 128 - 256 - 512 steps: - name: Setup GCC run: | sudo apt-get -y -qq update sudo apt-get 
-y -qq --no-install-suggests --no-install-recommends install gcc-${{ matrix.sys.gcc_runtime }}-riscv64-linux-gnu g++-${{ matrix.sys.gcc_runtime }}-riscv64-linux-gnu sudo update-alternatives --install /usr/bin/riscv64-linux-gnu-gcc riscv64-linux-gnu-gcc /usr/bin/riscv64-linux-gnu-gcc-${{ matrix.sys.gcc_runtime }} 20 sudo update-alternatives --install /usr/bin/riscv64-linux-gnu-g++ riscv64-linux-gnu-g++ /usr/bin/riscv64-linux-gnu-g++-${{ matrix.sys.gcc_runtime }} 20 - name: Setup LLVM if: ${{ matrix.sys.compiler == 'clang' }} run: | # Install given LLVM version curl -o llvm.sh https://apt.llvm.org/llvm.sh chmod u+x llvm.sh sudo ./llvm.sh ${{ matrix.sys.version }} sudo ln -srf $(which clang-${{ matrix.sys.version }}) /usr/bin/clang sudo ln -srf $(which clang++-${{ matrix.sys.version }}) /usr/bin/clang++ rm llvm.sh - name: Setup QEMU uses: docker/setup-qemu-action@v3.0.0 with: platforms: riscv64 - name: Setup Ninja run: | sudo apt-get -y -qq install ninja-build - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup run: > cmake -S . 
-B _build -GNinja -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl" -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-riscv64-linux-gnu.cmake - name: Build run: cmake --build _build - name: Set CPU feature test expectations run: | echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_SVE=0" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_RVV=1" >> "$GITHUB_ENV" - name: Testing xsimd run: > QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0" QEMU_LD_PREFIX="/usr/riscv64-linux-gnu" ./test/test_xsimd working-directory: ${{ github.workspace }}/_build xtensor-stack-xsimd-541558d/.github/workflows/cross-s390x.yml000066400000000000000000000051051517435117100240650ustar00rootroot00000000000000name: IBM Z cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' strategy: matrix: target: - { platform: 's390x', dir: 's390x-linux-gnu', full: 'OFF' } sys: - { compiler: 'gcc', version: '14' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | sudo apt-get update || exit 1 sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1 sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ 
matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user - name: Setup Ninja run: | sudo apt-get install ninja-build - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup run: | cmake -B build/ \ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build run: cmake --build build/ --verbose -j1 - name: Testing xsimd run: | # Set CPU feature test expectations, 0 is explicit absence of the feature export XSIMD_TEST_CPU_ASSUME_SSE4_2="0" export XSIMD_TEST_CPU_ASSUME_NEON64="0" export XSIMD_TEST_CPU_ASSUME_RVV="0" export XSIMD_TEST_CPU_ASSUME_VSX="0" export XSIMD_TEST_CPU_ASSUME_VXE="1" qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./build/test/test_xsimd xtensor-stack-xsimd-541558d/.github/workflows/cross-sve.yml000066400000000000000000000042611517435117100237760ustar00rootroot00000000000000name: Arm-SVE cross-compilation build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest name: 'Arm SVE${{ matrix.vector_bits }}' strategy: matrix: vector_bits: - 128 - 256 - 512 steps: - name: Setup compiler run: | sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++-10-aarch64-linux-gnu || exit 1 sudo update-alternatives --install /usr/bin/aarch64-linux-gnu-gcc 
aarch64-linux-gnu-gcc /usr/bin/aarch64-linux-gnu-gcc-10 20 sudo update-alternatives --install /usr/bin/aarch64-linux-gnu-g++ aarch64-linux-gnu-g++ /usr/bin/aarch64-linux-gnu-g++-10 20 - name: Setup QEMU run: | sudo apt-get --no-install-suggests --no-install-recommends install qemu-user - name: Setup Ninja run: | sudo apt-get install ninja-build - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup run: | cmake -B _build \ -GNinja \ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ -DCMAKE_BUILD_TYPE=Release \ -DTARGET_ARCH=generic \ -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" \ -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" \ -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake - name: Build run: cmake --build _build - name: Set CPU feature test expectations run: | echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_RVV=0" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_NEON64=1" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_SVE=1" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=unknown" >> "$GITHUB_ENV" - name: Testing xsimd run: qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build xtensor-stack-xsimd-541558d/.github/workflows/cxx-no-exceptions.yml000066400000000000000000000005261517435117100254450ustar00rootroot00000000000000name: C++ -fno-except compatibility on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Setup run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions - name: Build run: cmake --build _build xtensor-stack-xsimd-541558d/.github/workflows/cxx-versions.yml000066400000000000000000000010051517435117100245130ustar00rootroot00000000000000name: C++ compatibility build on: [push, pull_request] 
concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest strategy: matrix: cxx-version: [14, 17, 20] steps: - uses: actions/checkout@v6 - name: Setup run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}} - name: Build run: cmake --build _build xtensor-stack-xsimd-541558d/.github/workflows/doxygen.yml000066400000000000000000000005751517435117100235330ustar00rootroot00000000000000name: doc on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Install dependencies run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme - name: Render run: make -C docs xtensor-stack-xsimd-541558d/.github/workflows/emscripten.yml000066400000000000000000000012461517435117100242230ustar00rootroot00000000000000name: Emscripten build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: test: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v6 - uses: mamba-org/setup-micromamba@v2 with: environment-name: xsimd create-args: >- microsoft::playwright=1.50.0 python init-shell: bash - name: Build script shell: bash -el {0} run: | echo "Build script for wasm" playwright install ./test/test_wasm/test_wasm.sh 4.0.21 xtensor-stack-xsimd-541558d/.github/workflows/emulated.yml000066400000000000000000000025541517435117100236550ustar00rootroot00000000000000name: Linux emulated build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true defaults: run: shell: bash -l {0} jobs: build: runs-on: ubuntu-latest name: '${{ matrix.sys.compiler }} - emulated<${{ matrix.sys.size }}>' strategy: matrix: sys: - { 
compiler: 'g++', size: '128'} - { compiler: 'g++', size: '256'} - { compiler: 'g++', size: '512'} steps: - name: Checkout xsimd uses: actions/checkout@v6 - name: Install mamba uses: mamba-org/setup-micromamba@v1 with: environment-file: environment.yml - name: Setup GCC compiler if: ${{ matrix.sys.compiler == 'g++' }} run: echo "CXXFLAGS=-Wno-noexcept-type -Wno-stringop-overflow -Wno-maybe-uninitialized" >> $GITHUB_ENV - name: Configure build run: | cmake -B _build \ -DBUILD_TESTS=ON \ -DBUILD_BENCHMARK=ON \ -DBUILD_EXAMPLES=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER=${{ matrix.sys.compiler }} \ -DXSIMD_ENABLE_WERROR=ON \ -DTARGET_ARCH="emulated<${{ matrix.sys.size }}>" \ -DCMAKE_CXX_FLAGS="${CXXFLAGS}" \ -GNinja - name: Build run: ninja -C _build - name: Test run: ninja -C _build xtest xtensor-stack-xsimd-541558d/.github/workflows/linux.yml000066400000000000000000000136661517435117100232220ustar00rootroot00000000000000name: Linux x86 build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true defaults: run: shell: bash -l {0} jobs: build: runs-on: ubuntu-latest name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.flags }}' strategy: matrix: sys: - { compiler: 'gcc', version: '12', flags: 'force_no_instr_set' } - { compiler: 'gcc', version: '13', flags: 'enable_xtl_complex' } - { compiler: 'gcc', version: '14', flags: 'avx' } - { compiler: 'gcc', version: '13', flags: 'avx512' } - { compiler: 'gcc', version: '10', flags: 'avx512' } - { compiler: 'gcc', version: '12', flags: 'i386' } - { compiler: 'gcc', version: '13', flags: 'avx512pf' } - { compiler: 'gcc', version: '13', flags: 'avx512vbmi' } - { compiler: 'gcc', version: '14', flags: 'avx512vbmi2' } - { compiler: 'gcc', version: '13', flags: 'avx512vnni' } - { compiler: 'clang', version: '16', flags: 'force_no_instr_set' } - { compiler: 'clang', version: '16', flags: 'enable_xtl_complex' } - { compiler: 
'clang', version: '17', flags: 'avx' } - { compiler: 'clang', version: '17', flags: 'sse3' } - { compiler: 'clang', version: '18', flags: 'avx512' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} run: | GCC_VERSION=${{ matrix.sys.version }} sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION sudo dpkg --add-architecture i386 sudo add-apt-repository ppa:ubuntu-toolchain-r/test sudo apt-get update sudo apt-get --no-install-suggests --no-install-recommends install gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386 CC=gcc-$GCC_VERSION echo "CC=$CC" >> $GITHUB_ENV CXX=g++-$GCC_VERSION echo "CXX=$CXX" >> $GITHUB_ENV - name: Setup compiler if: ${{ matrix.sys.compiler == 'clang' }} run: | LLVM_VERSION=${{ matrix.sys.version }} sudo apt-get update || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1 sudo apt-get --no-install-suggests --no-install-recommends install g++ g++-multilib || exit 1 sudo ln -s /usr/include/asm-generic /usr/include/asm CC=clang-$LLVM_VERSION echo "CC=$CC" >> $GITHUB_ENV CXX=clang++-$LLVM_VERSION echo "CXX=$CXX" >> $GITHUB_ENV - name: Checkout xsimd uses: actions/checkout@v6 - name: Install mamba uses: mamba-org/setup-micromamba@v2 with: environment-file: environment.yml - name: Setup SDE if: startswith(matrix.sys.flags, 'avx512') run: sh install_sde.sh - name: Configure build env: CC: ${{ env.CC }} CXX: ${{ env.CXX }} run: | if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON" fi if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge" fi if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona" fi if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512" fi if [[ 
'${{ matrix.sys.flags }}' == 'avx512pf' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=cannonlake" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi2' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=icelake-server" fi if [[ '${{ matrix.sys.flags }}' == 'avx512vnni' ]]; then CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm" fi if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then CXX_FLAGS="$CXX_FLAGS -m32" fi if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then : else CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON" fi # Cheap way of spotting uninitialized read CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern" cmake -B _build \ -DBUILD_TESTS=ON \ -DBUILD_BENCHMARK=ON \ -DBUILD_EXAMPLES=ON \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=$CC \ -DCMAKE_CXX_COMPILER=$CXX \ $CMAKE_EXTRA_ARGS \ -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \ -G Ninja - name: Build run: cmake --build _build - name: Test run: | # Set CPU feature test expectations, 0 is explicit absence of the feature export XSIMD_TEST_CPU_ASSUME_NEON64="0" export XSIMD_TEST_CPU_ASSUME_RVV="0" export XSIMD_TEST_CPU_ASSUME_VSX="0" export XSIMD_TEST_CPU_ASSUME_VXE="0" cd _build/test if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then # Running with emulation, must have AVX512, lower tier are checked by implications in tests export XSIMD_TEST_CPU_ASSUME_AVX512F="1" ../../sde-external-9.48.0-2024-11-25-lin/sde64 -tgl -- ./test_xsimd else export XSIMD_TEST_CPU_ASSUME_SSE4_2=$(grep -q 'sse4_2' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX=$(grep -q 'avx' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_AVX512F=$(grep -q 'avx512f' /proc/cpuinfo && echo "1" || echo "0") export XSIMD_TEST_CPU_ASSUME_MANUFACTURER="intel,amd" ./test_xsimd fi 
xtensor-stack-xsimd-541558d/.github/workflows/macos.yml000066400000000000000000000026041517435117100231530ustar00rootroot00000000000000name: macOS build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build: strategy: matrix: os: - 14 - 15 - 15-intel runs-on: macos-${{ matrix.os }} name: 'macos-${{ matrix.os }}' steps: - uses: actions/checkout@v6 - name: Setup run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" - name: Build run: cmake --build _build --verbose - name: Testing sequential run: cmake --build _build --target xbenchmark --verbose - name: Set CPU feature test expectations run: | echo "XSIMD_TEST_CPU_ASSUME_RVV=0" >> "$GITHUB_ENV" if echo '${{ matrix.os }}' | grep -q intel; then echo "XSIMD_TEST_CPU_ASSUME_NEON64=0" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=1" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=intel" >> "$GITHUB_ENV" else echo "XSIMD_TEST_CPU_ASSUME_NEON64=1" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_SSE4_2=0" >> "$GITHUB_ENV" echo "XSIMD_TEST_CPU_ASSUME_MANUFACTURER=unknown" >> "$GITHUB_ENV" fi - name: Testing xsimd run: ${{github.workspace}}/_build/test/test_xsimd xtensor-stack-xsimd-541558d/.github/workflows/sanitizer.yml000066400000000000000000000023211517435117100240550ustar00rootroot00000000000000name: sanitizer on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true defaults: run: shell: bash -l {0} jobs: build: runs-on: ubuntu-latest name: 'sanitizer - ${{ matrix.flags }}' strategy: matrix: flags: - sanitize=address - sanitize=undefined - fast-math -fsanitize=undefined llvm-version: [20] env: CC: clang-${{ matrix.llvm-version }} CXX: clang++-${{ matrix.llvm-version }} steps: - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup 
compiler run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh sudo ./llvm.sh ${{ matrix.llvm-version }} - name: Configure build run: | cmake -B_build \ -DBUILD_TESTS=ON \ -DBUILD_BENCHMARK=ON \ -DBUILD_EXAMPLES=ON \ -DDOWNLOAD_DOCTEST=ON \ -DCMAKE_BUILD_TYPE=Debug \ -DCMAKE_CXX_FLAGS='-f${{ matrix.flags }} -O0 -g -fno-inline' \ -G Ninja - name: Build run: cmake --build _build - name: Test run: ./_build/test/test_xsimd xtensor-stack-xsimd-541558d/.github/workflows/style-check.yml000066400000000000000000000027061517435117100242670ustar00rootroot00000000000000name: style check on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: formatting-check: name: Format check runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - run: sudo apt install clang-format - run: | git fetch origin ${{ github.event.pull_request.base.sha }} git clang-format --diff ${{ github.event.pull_request.base.sha }} | tee diff.patch ! grep -q '^diff ' diff.patch inlining-check: runs-on: ubuntu-latest name: Check inline keyword usage steps: - uses: actions/checkout@v6 - run: sudo apt install clang-tools - run: sh ./test/check_inline_specifier.sh . include-check: runs-on: ubuntu-latest name: Check unused standard includes steps: - uses: actions/checkout@v6 - run: pip install diskarzhan - run: diskarzhan `find -name '*.[ch]pp'` clang-tidy-check: name: Clang-tidy check (x86_64) runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - run: sudo apt install clang-tidy - name: Configure run: cmake -B _build -DCMAKE_CXX_COMPILER=clang++ -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Debug -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_CXX_FLAGS='-march=tigerlake' . 
- name: Check run: run-clang-tidy -p _build xtensor-stack-xsimd-541558d/.github/workflows/windows.yml000066400000000000000000000122401517435117100235400ustar00rootroot00000000000000name: Windows build on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} cancel-in-progress: true jobs: build-windows-x86: name: 'MSVC ${{ matrix.os }}, ${{ matrix.target }} ${{ matrix.sys.set }}' defaults: run: shell: bash {0} strategy: matrix: os: - 2022 - 2025 target: - x86 - x64 sys: - { set: SSE, flags: "/arch:SSE2" } - { set: AVX, flags: "/arch:AVX" } - { set: AVX2, flags: "/arch:AVX2" } - { set: AVX512, flags: "/arch:AVX512" } exclude: # AVX on both platforms has a codegen error # On 2019 in _mm256_rsqrt_ps, on 2022 in _mm256_blend_p* - { sys: { set: AVX } } # On both platforms x86 + AVX512 triggers a compiler crash - { target: x86, sys: { set: AVX512 } } # /arch:SSE2 is not available on x64 platforms (SSE2 is enabled by default) - { target: x64, sys: { set: SSE} } runs-on: windows-${{ matrix.os }} steps: - name: Setup compiler uses: ilammy/msvc-dev-cmd@v1 with: arch: ${{ matrix.target }} - name: Setup Ninja run: | python3 -m pip install --upgrade pip setuptools wheel python3 -m pip install ninja - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja - name: Build run: cmake --build _build - name: Testing xsimd if: ${{ !startsWith(matrix.sys.set, 'AVX512') }} env: # Set CPU feature test expectations # Assuming the runner always has AVX2 (independent of compilation option) XSIMD_TEST_CPU_ASSUME_NEON64: "0" XSIMD_TEST_CPU_ASSUME_SSE4_2: "1" XSIMD_TEST_CPU_ASSUME_AVX2: "1" XSIMD_TEST_CPU_ASSUME_MANUFACTURER: "intel,amd" run: ./_build/test/test_xsimd build-windows-mingw: name: 'MSYS2 ${{ matrix.msystem }}' runs-on: windows-2022 defaults: run: shell: 
msys2 {0} strategy: matrix: # Temporarily remove MINGW64 and UCRT64 builds because # GCC 12 gives an unexpected overflow warning for __builtin_memmove # see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106199 msystem: [ MINGW32, CLANG64 ] #msystem: [ MINGW32, MINGW64, UCRT64, CLANG32, CLANG64 ] fail-fast: false steps: - name: Use MinGW from MSYS2 uses: msys2/setup-msys2@v2 with: msystem: ${{ matrix.msystem }} update: true path-type: minimal pacboy: >- cc:p cmake:p ninja:p - name: Checkout xsimd uses: actions/checkout@v6 - name: Configure run: cmake -B _build -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DDOWNLOAD_DOCTEST=ON -G Ninja - name: Build run: cmake --build _build - name: Test run: ./_build/test/test_xsimd build-windows-clang-cl: name: 'clang-cl x64 ${{ matrix.config.name }}' defaults: run: shell: bash {0} strategy: matrix: config: - { name: "AVX2", flags: "/arch:AVX2", benchmark: "ON", examples: "ON" } - { name: "/fp:fast", flags: "/fp:fast", benchmark: "OFF", examples: "OFF" } runs-on: windows-2025 steps: - name: Setup compiler uses: ilammy/msvc-dev-cmd@v1 with: arch: amd64 - name: Check clang-cl run: | command -v clang-cl clang-cl --version - name: Setup Ninja run: | python3 -m pip install --upgrade pip setuptools wheel python3 -m pip install ninja - name: Checkout xsimd uses: actions/checkout@v3 - name: Setup run: | cmake -B _build \ -DBUILD_TESTS=ON \ -DDOWNLOAD_DOCTEST=ON \ -DBUILD_BENCHMARK=${{ matrix.config.benchmark }} \ -DBUILD_EXAMPLES=${{ matrix.config.examples }} \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_C_COMPILER=clang-cl \ -DCMAKE_CXX_COMPILER=clang-cl \ -DCMAKE_CXX_FLAGS="${{ matrix.config.flags }} -DXSIMD_REASSOCIATIVE_MATH=1" \ -G Ninja - name: Build run: cmake --build _build - name: Testing xsimd run: ./_build/test/test_xsimd build-windows-arm64: name: 'MSVC arm64' defaults: run: shell: bash {0} runs-on: windows-11-arm steps: - name: Setup compiler uses: ilammy/msvc-dev-cmd@v1 with: arch: arm64 - 
name: Setup Ninja run: | python3 -m pip install --upgrade pip setuptools wheel python3 -m pip install ninja - name: Checkout xsimd uses: actions/checkout@v6 - name: Setup run: cmake -B _build -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja - name: Build run: cmake --build _build - name: Testing xsimd run: ./_build/test/test_xsimd xtensor-stack-xsimd-541558d/.gitignore000066400000000000000000000007251517435117100177230ustar00rootroot00000000000000# Generated pkg-config files *.pc # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # Vim tmp files *.swp # Build folder build/ # Documentation build artefacts docs/CMakeCache.txt docs/xml/ docs/build/ # VSCode / clangd IntelliSense .vscode/ .cache/ # CLion / IDEA .idea/xtensor-stack-xsimd-541558d/CMakeLists.txt000066400000000000000000000103761517435117100204760ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. 
# ############################################################################ cmake_minimum_required(VERSION 3.13) project(xsimd) # Versioning # ========== file(STRINGS "include/xsimd/config/xsimd_config.hpp" xsimd_version_defines REGEX "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH)") foreach(ver ${xsimd_version_defines}) if(ver MATCHES "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$") set(XSIMD_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "") endif() endforeach() set(${PROJECT_NAME}_VERSION ${XSIMD_VERSION_MAJOR}.${XSIMD_VERSION_MINOR}.${XSIMD_VERSION_PATCH}) message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}") # Build # ===== add_library(xsimd INTERFACE) add_library(xsimd::xsimd ALIAS xsimd) target_include_directories(xsimd INTERFACE $ $) OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF) OPTION(BUILD_TESTS "xsimd test suite" OFF) target_compile_features(xsimd INTERFACE cxx_std_14) if(ENABLE_XTL_COMPLEX) find_package(xtl 0.8.0 REQUIRED) target_compile_definitions(xsimd INTERFACE XSIMD_ENABLE_XTL_COMPLEX=1) target_link_libraries(xsimd INTERFACE xtl) endif() if(BUILD_TESTS) enable_testing() add_subdirectory(test) endif() OPTION(BUILD_BENCHMARK "xsimd benchmarks" OFF) if(BUILD_BENCHMARK) add_subdirectory(benchmark) endif() OPTION(BUILD_EXAMPLES "xsimd examples" OFF) if(BUILD_EXAMPLES) add_subdirectory(examples) endif() # Installation # ============ OPTION(XSIMD_SKIP_INSTALL "Skip installation or not. By default it is OFF" OFF) if(${XSIMD_SKIP_INSTALL}) return() # skip installation endif () include(GNUInstallDirs) include(CMakePackageConfigHelpers) install(TARGETS xsimd EXPORT ${PROJECT_NAME}-targets) # Makes the project importable from the build directory export(EXPORT ${PROJECT_NAME}-targets FILE "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake") install(DIRECTORY include/xsimd DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". 
set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}" CACHE STRING "install path for xsimdConfig.cmake") configure_package_config_file(${PROJECT_NAME}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" INSTALL_DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) # xsimd is header-only and does not depend on the architecture. # Remove CMAKE_SIZEOF_VOID_P from xtensorConfigVersion.cmake so that an xtensorConfig.cmake # generated for a 64 bit target can be used for 32 bit targets and vice versa. set(_XTENSOR_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P}) unset(CMAKE_SIZEOF_VOID_P) write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake VERSION ${${PROJECT_NAME}_VERSION} COMPATIBILITY SameMajorVersion) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) install(EXPORT ${PROJECT_NAME}-targets FILE ${PROJECT_NAME}Targets.cmake DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR}) configure_file(${PROJECT_NAME}.pc.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" @ONLY) install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc" DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig/") xtensor-stack-xsimd-541558d/CONTRIBUTING.md000066400000000000000000000016651517435117100201700ustar00rootroot00000000000000# Contributing to xsimd First, thanks for being there! Welcome on board, we will try to make your contributing journey as good an experience as it can be. # Submitting patches Patches should be submitted through Github PR. We did put some effort to setup a decent Continuous Integration coverage, please try to make it green ;-) We use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to keep the coding style consistent, a ``.clang-format`` file is shipped within the source, feel free to use it! 
# Extending the API We are open to extending the API, as long as it has been discussed either in an Issue or a PR. The only constraint is to add testing for new functions, and make sure they work on all supported architectures, not only your favorite one! # Licensing We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. Stated otherwise, there's no copyright assignment. xtensor-stack-xsimd-541558d/Changelog.rst000066400000000000000000000317511517435117100203570ustar00rootroot00000000000000.. Copyright (c) Serge Guelton and Johan Mabille Copyright (c) QuantStack Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Changelog ========= 14.2.0 ------ * **New architecture**: IBM Z (s390x) support * [API] New cross-platform ``cpu_features`` API for querying CPU features available at runtime * [API] Add ``xsimd::get()`` for compile-time lane extraction * [API] Add ``xsimd::stream_load``, ``xsimd::stream_store``, and ``xsimd::fence`` for non-temporal memory transfers * [VSX] Fix dynamic dispatch support with runtime cpu feature inspection * [VSX] Fix rounding * [SVE/RVV] Fix dynamic dispatch by inspecting available vector length * [AVX2] Add native ``uint64``/``int64`` multiplication kernel * [NEON] Add support for Windows ARM * [NEON] Simplify static dispatch of intrinsicts * [NEON] Fix ``batch_bool`` store on ARM by replacing ``vst1_lane_u32`` with a full lane store followed by a memcpy * [SVE] Fix dynamic dispatch ODR violation * [ci] Fix emulated architecture interaction with AVX512 leading to CI failures. 
Provide a cmake-level configuration switch for emulated build * Fix build with compilers that do not support C++20 (even though we only require C++14) * Fix ``xsimd::signbit`` scalar overload leaking into non-scalar overload resolution * Fix complex batch load * Harden fast-math reassociation barriers * Publish the C++14 requirement through the CMake interface 14.1.0 ------ * Add popcnt and bmi * [API] Add bitwise-shift batch constant api * Refactor x86 CPU features * [NEON] Unsigned bitwise shifts are never called * Improve coverage of emulated architectures * Introduce `count{l,r}_{zero,one}` for `batch_bool` * Fix emulated mask() * [neon] Implement bitwise_rshift for 64 bit integers on arm32 * Fix fast_cast int64/uint64→double under -ffast-math * Small complexity reduction * Add make_batch_constant from std::array in C++20 * [ci] Use home-baked clang-format action * Fix apple detection * [ci] add GCC 10 with AVX-512 to test matrix * Slighly less pessimistic detection of neon64 * Fix runtime detection of SVE * [ci] Setup Windows arm64 runner * iota batch constant and a few overloads * [test] Improve testing logging and accuracy * Fix default values for AVX and AVX512 OS state enabled flags * Implement batch_bool::mask() for riscv * [ci] Revert emscripten to 4.0.21 * Restore RISCV support * Implement optimized movemasks for NEON * Fix limit behavior of atan2 under -ffast-math * Move to C++14 14.0.0 ------ * **New architecture**: VMX with VSX extension * [API] Add ``xsimd::bitwise_[l|r]shift(...)`` and ``xsimd::rot[l|r](...)`` * [API] Add ``xsimd::widen`` to widen a batch to a batch twice as big * [API] Add ``xsimd::first()`` function to extract the first lane from a batch * [API] Reorder ``xsimd::make_batch_constant`` and ``xsimd::make_batch_bool_constant`` template arguments * Bump CMake requirement to 3.10 * Provide generic and specialize implementation of ``xsimd::reduce_mul`` * Have ``xsimd::max`` / ``min`` behave as ``std::max`` / ``min`` when one argument is 
NaN * Optimize batch_bool load/store from/to array of booleans * Cleaner error when trying to instantiate a batch while no arch is supported * Fix ``XSIMD_INLINE`` for compilers that don't have always_inline * Rename ``xsimd::generic`` in ``xsimd::common`` * Fix ``xsimd::log10`` implementation under ``-ffast-math``, and add ``-fast-math-support`` to generic math algorithm and tests * Bump xtl dependency requirement * Provide a generic implementation of ``swizzle`` with constant mask * Enable xsimd with only emulated arch * Rename ``avx512vnni`` in ``avx512vnni`` * [SSE2] Fix and improve ``xsimd::swizzle`` on ``[u]int16`` * [AVX512x] Specialize ``xsimd::insert``, ``xsimd::incr_if``, ``xsimd::decr_if`` * [AVX512F,AVX512VBMI] Sepcialize ``xsimd::slide_left`` and ``xsimd::slide_right`` * [AVX512F] Fix ``batch_bool`` xor * [WASM] Fix neq for ``batch_bool`` * [AVX/AVX2/AVX512/ARM32] Improve implementation of ``xsimd::swizzle`` * [AVX512VBMI2] Speciliaze ``xsimd::compress`` and ``xsimd::expand`` * [SSE/AVX/AVX512] Improve ``xsimd::reduce_add`` * [SSSE3/AVX2] Fix ``xsimd::rotate_left`` implementation for ``[u]int16`` and optimize the ``[u]int8`` implementation * [AVX2] Fix implementation of ``xsimd::rotate_left`` * [AVX512] Disable faulty implementation of ``xsimd::rotate_left`` * [ARM64] Improve implementation of comparison operator for 64 bit integers * [AVX512BW] Optimize ``xsimd::shift_left`` and ``xsimd::shift_right`` * [AVX512F] Fix ``batch_const`` with 16b and 8b integers 13.2.0 ------ * Added broadcast overload for bool * Fixed kernel::store for booleans * Explicitly verify dependency between architectures (like sse2 implies sse2) * Use default arch alignment as default alignment for xsimd::aligned_allocator * sse2 version of xsimd::swizzle on [u]int16_t * avx implementation of transpose for [u]int[8|16] * Implement [u]int8 and [u]int16 matrix transpose for 128 bit registers * Fix minor warning * Fix fma4 support 13.1.0 ------ * Fix rotate_left and rotate_right 
behavior (it was swapped!) * Fix compress implementation on RISC-V * Improve RISC-V CI * Fix clang-17 compilation on RISC-V * Validate cmake integration * Provide xsimd::transpose on 64 and 32 bits on most platforms * Improve documentation * Provide xsimd::batch_bool::count * Fix interaction between xsimd::make_sized_batch_t and xsimd::batch * Fix vbmi, sve and rvv detection through xsimd::available_architectures * Fix compilation on MS targets where ``small`` can be defined. * Change default install directory for installed headers. * Support mixed-complex implementations of xsimd::pow() * Improve xsimd::pow implementation for complex numbers * Fix uninitialized read in lgamma implementation 13.0.0 ------ * Most xsimd functions are flagged as always_inline * Fix some xsimd scalar version (abs, bitofsign, signbit, bitwise_cast, exp10) * Move from batch_constant, Csts...> to batch_constant * Move from batch_bool_constant, Csts...> to batch_bool_constant * Provide an as_batch() method (resp. as_batch_bool) method for batch_constant (resp. batch_bool_constant) * New architecture emulated for batches of N bits emulated using scalar operations. 
* Remove the version method from all architectures * Support xsimd::avg and xsimd::avgr vector operation * Model i8mm arm extension * Fix dispatching mechanism 12.1.1 ------ * Update readme with a section on adoption, and a section on the history of the project * Fix/avx512vnni implementation * Fix regression on XSIMD_NO_SUPPORTED_ARCHITECTURE 12.1.0 ------ * Fix various problems with architecture version handling * Specialize xsimd::compress for riscv * Provide stubs for various avx512xx architectures 12.0.0 ------ * Fix sincos implementation to cope with Emscripten * Upgraded minimal version of cmake to remove deprecation warning * Fixed constants::signmask for GCC when using ffast-math * Add RISC-V Vector support * Generic, simple implementation fox xsimd::compress * Disable batch of bools, and suggest using batch_bool instead * Add an option to skip installation 11.2.0 ------ * Provide shuffle operations of floating point batches * Provide a generic implementation of xsimd::swizzle with dynamic indices * Implement rotl, rotr, rotate_left and rotate_right * Let CMake figure out pkgconfig directories * Add missing boolean operators in xsimd_api.hpp * Initial Implementation for the new WASM based instruction set * Provide a generic version for float to uint32_t conversion 11.1.0 ------ * Introduce XSIMD_DEFAULT_ARCH to force default architecture (if any) * Remove C++ requirement on xsimd::exp10 scalar implementation * Improve and test documentation 11.0.0 ------ * Provide a generic reducer * Fix ``find_package(xsimd)`` for xtl enabled xsimd, reloaded * Cleanup benchmark code * Provide avx512f implementation of FMA and variant * Hexadecimal floating points are not a C++11 feature * back to slow implementation of exp10 on Windows * Changed bitwise_cast API * Provide generic signed /unsigned type conversion * Fixed sde location * Feature/incr decr * Cleanup documentation 10.0.0 ------ * Fix potential ABI issue in SVE support * Disable fast exp10 on OSX * Assert on 
unaligned memory when calling aligned load/store * Fix warning about uninitialized storage * Always forward arch parameter * Do not specialize the behavior of ``simd_return_type`` for char * Support broadcasting of complex batches * Make xsimd compatible with -fno-exceptions * Provide and test comparison operators overloads that accept scalars 9.0.1 ----- * Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to size-dependent type. 9.0.0 ----- * Support fixed size SVE * Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16`` * Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max`` * Properly report unsupported double for neon on arm32 * Fill holes in xsimd scalar api * Fix ``find_package(xsimd)`` for xtl enabled xsimd * Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast`` * Native ``xsimd::hadd`` for float on arm64 * Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex * Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)`` * Flag some function with ``[[nodiscard]]`` * Accept both relative and absolute libdir and include dir in xsimd.pc * Implement ``xsimd::nearbyint_as_int`` for NEON * Add ``xsimd::polar`` * Speedup double -> F32/I32 gathers * Add ``xsimd::slide_left`` and ``xsimd::slide_right`` * Support integral ``xsimd::swizzles`` on AVX 8.1.0 ----- * Add ``xsimd::gather`` and ``xsimd::scatter`` * Add ``xsimd::nearbyint_as_int`` * Add ``xsimd::none`` * Add ``xsimd::reciprocal`` * Remove batch constructor from memory adress, use ``xsimd::batch<...>::load_(un)aligned`` instead * Leave to msvc users the opportunity to manually disable FMA3 on AVX * Provide ``xsimd::insert`` to modify a single value from a vector * Make ``xsimd::pow`` implementation resilient to ``FE_INVALID`` * Reciprocal square root support through ``xsimd::rsqrt`` * NEON: Improve ``xsimd::any`` and ``xsimd::all`` * Provide type utility to 
explicitly require a batch of given size and type * Implement ``xsimd::swizzle`` on x86, neon and neon64 * Avx support for ``xsimd::zip_lo`` and ``xsimd::zip_hi`` * Only use ``_mm256_unpacklo_epi`` on AVX2 * Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)`` * Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float`` * Provide AVX2 conversion function from ``(u)int64_t`` to ``double`` * Provide better SSE conversion function from ``uint64_t`` to ``double`` * Provide better SSE conversion function to ``double`` * Support logical xor for ``xsimd::batch_bool`` * Clarify fma support: - FMA3 + SSE -> ``xsimd::fma3`` - FMA3 + AVX -> ``xsimd::fma3`` - FMA3 + AVX2 -> ``xsimd::fma3`` - FMA4 -> ``xsimd::fma4`` * Allow ``xsimd::transform`` to work with complex types * Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj`` 8.0.5 ----- * Fix neon ``xsimd::hadd`` implementation * Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE`` if needs be 8.0.4 ----- * Provide some conversion operators for ``float`` -> ``uint32`` * Improve code generated for AVX2 signed integer comparisons * Enable detection of avx512cd and avx512dq, and fix avx512bw detection * Enable detection of AVX2+FMA * Pick the best compatible architecture in ``xsimd::dispatch`` * Enables support for FMA when AVX2 is detected on Windows * Add missing includes / forward declaration * Mark all functions inline and noexcept * Assert when using incomplete ``std::initializer_list`` 8.0.3 ----- * Improve CI & testing, no functional change 8.0.2 ----- * Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction 8.0.1 ----- * Fix invalid constexpr ``std::make_tuple`` usage in neon64 xtensor-stack-xsimd-541558d/LICENSE000066400000000000000000000031061517435117100167340ustar00rootroot00000000000000Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou Copyright (c) 2016, QuantStack Copyright (c) 2018, Serge 
Guelton All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xtensor-stack-xsimd-541558d/README.md000066400000000000000000000210031517435117100172020ustar00rootroot00000000000000# ![xsimd](docs/source/xsimd.svg) [![GHA android](https://github.com/xtensor-stack/xsimd/actions/workflows/android.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/android.yml) [![GHA cross-rvv](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-rvv.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-rvv.yml) [![GHA cross-sve](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-sve.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-sve.yml) [![GHA cross](https://github.com/xtensor-stack/xsimd/actions/workflows/cross.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cross.yml) [![GHA cxx-no-exceptions](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-no-exceptions.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-no-exceptions.yml) [![GHA cxx-versions](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-versions.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-versions.yml) [![GHA emscripten](https://github.com/xtensor-stack/xsimd/actions/workflows/emscripten.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/emscripten.yml) [![GHA linux](https://github.com/xtensor-stack/xsimd/actions/workflows/linux.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/linux.yml) [![GHA macos](https://github.com/xtensor-stack/xsimd/actions/workflows/macos.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/macos.yml) [![GHA windows](https://github.com/xtensor-stack/xsimd/actions/workflows/windows.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/windows.yml) [![Documentation 
Status](http://readthedocs.org/projects/xsimd/badge/?version=latest)](https://xsimd.readthedocs.io/en/latest/?badge=latest) [![Zulip](https://img.shields.io/badge/social_chat-zulip-blue.svg)](https://xtensor.zulipchat.com/#narrow/channel/539553-Ask-anything) C++ wrappers for SIMD intrinsics ## Introduction SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. However, these instructions differ between microprocessor vendors and compilers. `xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of numbers with the same arithmetic operators as for single values. It also provides accelerated implementation of common mathematical functions operating on batches. ## Adoption Beyond Xtensor, Xsimd has been adopted by major open-source projects, such as Mozilla Firefox, Apache Arrow, Pythran, and Krita. ## History The XSimd project started with a series of blog articles by Johan Mabille on how to implement wrappers for SIMD intrinsicts. The archives of the blog can be found here: [The C++ Scientist](http://johanmabille.github.io/blog/archives/). The design described in the articles remained close to the actual architecture of XSimd up until Version 8.0. The mathematical functions are a lightweight implementation of the algorithms originally implemented in the now deprecated [boost.SIMD](https://github.com/NumScale/boost.simd) project. ## Requirements `xsimd` requires a C++14 compliant compiler. 
The following C++ compilers are supported: Compiler | Version ------------------------|------------------------------- Microsoft Visual Studio | MSVC 2015 update 2 and above g++ | 4.9 and above clang | 4.0 and above The following SIMD instruction set extensions are supported: Architecture | Instruction set extensions -------------|----------------------------------------------------- x86 | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA3+SSE, FMA3+AVX, FMA3+AVX2 x86 | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher) x86 AMD | FMA4 ARM | NEON, NEON64, SVE128/256/512 (fixed vector size) WebAssembly | WASM powerpc64 | VSX RISC-V | RISC-V128/256/512 (fixed vector size) IBM Z (s390x)| VXE (IBM z14) ## Installation ### Install from conda-forge A package for xsimd is available on the mamba (or conda) package manager. ```bash mamba install -c conda-forge xsimd ``` ### Install with Spack A package for xsimd is available on the Spack package manager. ```bash spack install xsimd spack load xsimd ``` ### Install from sources You can directly install it from the sources with cmake: ```bash cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix . make install ``` ## Documentation To get started with using `xsimd`, check out the full documentation http://xsimd.readthedocs.io/ ## Dependencies `xsimd` has an optional dependency on the [xtl](https://github.com/xtensor-stack/xtl) library: | `xsimd` | `xtl` (optional) | |---------|------------------| | master | ^0.7.0 | | 12.x | ^0.7.0 | | 11.x | ^0.7.0 | | 10.x | ^0.7.0 | | 9.x | ^0.7.0 | | 8.x | ^0.7.0 | The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`. ## Usage The version 8 of the library is a complete rewrite and there are some slight differences with 7.x versions. A migration guide will be available soon. In the meanwhile, the following examples show how to use both versions 7 and 8 of the library? 
### Explicit use of an instruction set extension Here is an example that computes the mean of two sets of 4 double floating point values, assuming AVX extension is supported: ```cpp #include #include "xsimd/xsimd.hpp" namespace xs = xsimd; int main(int argc, char* argv[]) { xs::batch a = {1.5, 2.5, 3.5, 4.5}; xs::batch b = {2.5, 3.5, 4.5, 5.5}; auto mean = (a + b) / 2; std::cout << mean << std::endl; return 0; } ``` Do not forget to enable AVX extension when building the example. With gcc or clang, this is done with the `-mavx` flag, on MSVC you have to pass the `/arch:AVX` option. This example outputs: ```cpp (2.0, 3.0, 4.0, 5.0) ``` ### Auto detection of the instruction set extension to be used The same computation operating on vectors and using the most performant instruction set available at compile time, based on the provided compiler flags (e.g. ``-mavx2`` for GCC and Clang to target AVX2): ```cpp #include #include #include "xsimd/xsimd.hpp" namespace xs = xsimd; using vector_type = std::vector>; void mean(const vector_type& a, const vector_type& b, vector_type& res) { std::size_t size = a.size(); constexpr std::size_t simd_size = xsimd::simd_type::size; std::size_t vec_size = size - size % simd_size; for(std::size_t i = 0; i < vec_size; i += simd_size) { auto ba = xs::load_aligned(&a[i]); auto bb = xs::load_aligned(&b[i]); auto bres = (ba + bb) / 2.; bres.store_aligned(&res[i]); } for(std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2.; } } ``` ## Building and Running the Tests Building the tests requires [cmake](https://cmake.org). `cmake` is available as a package for most linux distributions. 
Besides, they can also be installed with the `conda` package manager (even on windows): ```bash conda install -c conda-forge cmake ``` Once `cmake` is installed, you can build and run the tests: ```bash mkdir build cd build cmake ../ -DBUILD_TESTS=ON make xtest ``` In the context of continuous integration with Travis CI, tests are run in a `conda` environment, which can be activated with ```bash cd test conda env create -f ./test-environment.yml source activate test-xsimd cd .. cmake . -DBUILD_TESTS=ON make xtest ``` ## Building the HTML Documentation xsimd's documentation is built with three tools - [doxygen](http://www.doxygen.org) - [sphinx](http://www.sphinx-doc.org) - [breathe](https://breathe.readthedocs.io) While doxygen must be installed separately, you can install breathe by typing ```bash pip install breathe ``` Breathe can also be installed with `conda` ```bash conda install -c conda-forge breathe ``` Finally, build the documentation with ```bash make html ``` from the `docs` subdirectory. ## License We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details. xtensor-stack-xsimd-541558d/benchmark/000077500000000000000000000000001517435117100176615ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/benchmark/CMakeLists.txt000066400000000000000000000060101517435117100224160ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. 
# ############################################################################

# Build script for the `benchmark_xsimd` executable. It can be configured on
# its own (in which case it locates an installed xsimd package) or be pulled in
# from a parent project that already provides the `xsimd` target.
cmake_minimum_required(VERSION 3.13)

# Only declare a project and look up xsimd when this directory is the root of
# the build tree.
if("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
    project(xsimd-benchmark)
    find_package(xsimd REQUIRED CONFIG)
endif()

# Default to a Release build when the user did not pick a build type.
# CMAKE_BUILD_TYPE is ignored by multi-config generators, so it is left
# untouched there instead of being forced into the cache.
get_property(xsimd_benchmark_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(xsimd_benchmark_is_multi_config)
    message(STATUS "Benchmark build type is chosen at build time (multi-config generator)")
elseif(NOT CMAKE_BUILD_TYPE)
    message(STATUS "Setting benchmark build type to Release")
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
else()
    message(STATUS "Benchmark build type is ${CMAKE_BUILD_TYPE}")
endif()

include(CheckCXXCompilerFlag)

if(CMAKE_CXX_COMPILER_ID MATCHES "Clang"
   OR CMAKE_CXX_COMPILER_ID MATCHES "GNU"
   OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    # Target the host CPU unless an architecture was already requested through
    # the compiler flags or CMAKE_OSX_ARCHITECTURES.
    if(NOT CMAKE_CXX_FLAGS MATCHES "-march"
       AND NOT CMAKE_CXX_FLAGS MATCHES "-arch"
       AND NOT CMAKE_OSX_ARCHITECTURES)
        string(APPEND CMAKE_CXX_FLAGS " -march=native")
    endif()

    # Select the language standard: C++17 when xtl complex support is enabled,
    # C++14 otherwise. Configuration aborts if the compiler rejects the flag.
    if(NOT MSVC)
        if(ENABLE_XTL_COMPLEX)
            check_cxx_compiler_flag("-std=c++17" HAS_CPP17_FLAG)
            if(NOT HAS_CPP17_FLAG)
                message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++17 support when xtl complex support is enabled")
            endif()
            string(APPEND CMAKE_CXX_FLAGS " -std=c++17")
        else()
            check_cxx_compiler_flag("-std=c++14" HAS_CPP14_FLAG)
            if(NOT HAS_CPP14_FLAG)
                message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++14 support!")
            endif()
            string(APPEND CMAKE_CXX_FLAGS " -std=c++14")
        endif()
    endif()
endif()

if(MSVC)
    string(APPEND CMAKE_CXX_FLAGS " /EHsc /MP /bigobj")
    string(APPEND CMAKE_CXX_FLAGS " /arch:AVX2")
    # Append rather than overwrite, so linker flags supplied by the user or a
    # toolchain file are preserved.
    string(APPEND CMAKE_EXE_LINKER_FLAGS " /MANIFEST:NO")
    # Rewrite /MD to -MT in every per-configuration flag set
    # (presumably to link the static MSVC runtime -- confirm).
    foreach(flag_var
            CMAKE_CXX_FLAGS
            CMAKE_CXX_FLAGS_DEBUG
            CMAKE_CXX_FLAGS_RELEASE
            CMAKE_CXX_FLAGS_MINSIZEREL
            CMAKE_CXX_FLAGS_RELWITHDEBINFO)
        string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}")
    endforeach()
endif()

# Sources of the benchmark driver.
set(XSIMD_BENCHMARK_SRC
    main.cpp
    xsimd_benchmark.hpp
)

add_executable(benchmark_xsimd ${XSIMD_BENCHMARK_SRC})
target_link_libraries(benchmark_xsimd PRIVATE xsimd)

if(ENABLE_XTL_COMPLEX)
target_link_libraries(benchmark_xsimd PRIVATE xtl) endif() add_custom_target(xbenchmark COMMAND benchmark_xsimd DEPENDS benchmark_xsimd) xtensor-stack-xsimd-541558d/benchmark/main.cpp000066400000000000000000000155671517435117100213270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include #include "xsimd_benchmark.hpp" void benchmark_operation() { // std::size_t size = 9984; std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::add_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::sub_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::mul_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::div_fn(), std::cout, size, 1000); } void benchmark_exp_log() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::exp_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::exp2_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::expm1_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log2_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::log10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::log1p_fn(), std::cout, size, 1000); } void benchmark_trigo() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::sin_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cos_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::tan_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::asin_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); 
xsimd::run_benchmark_1op(xsimd::acos_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); xsimd::run_benchmark_1op(xsimd::atan_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo); } void benchmark_hyperbolic() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::sinh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cosh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::tanh_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::asinh_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::acosh_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::atanh_fn(), std::cout, size, 100); } void benchmark_power() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::pow_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::sqrt_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::cbrt_fn(), std::cout, size, 100); xsimd::run_benchmark_2op(xsimd::hypot_fn(), std::cout, size, 1000); } void benchmark_rounding() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::ceil_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::floor_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::trunc_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::round_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::nearbyint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op(xsimd::rint_fn(), std::cout, size, 100); } #ifdef XSIMD_POLY_BENCHMARKS void benchmark_poly_evaluation() { std::size_t size = 20000; xsimd::run_benchmark_1op(xsimd::horner_5_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_5_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_10_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_12_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_12_fn(), std::cout, size, 1000); 
xsimd::run_benchmark_1op(xsimd::horner_14_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_14_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::horner_16_fn(), std::cout, size, 1000); xsimd::run_benchmark_1op(xsimd::estrin_16_fn(), std::cout, size, 1000); } #endif void benchmark_basic_math() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::fmod_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::remainder_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::fdim_fn(), std::cout, size, 1000); xsimd::run_benchmark_3op(xsimd::clip_fn(), std::cout, size, 1000); #if 0 xsimd::run_benchmark_1op_pred(xsimd::isfinite_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::isinf_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_flint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_odd_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_even_fn(), std::cout, size, 100); #endif } int main(int argc, char* argv[]) { const std::map> fn_map = { { "op", { "arithmetic", benchmark_operation } }, { "exp", { "exponential and logarithm", benchmark_exp_log } }, { "trigo", { "trigonometric", benchmark_trigo } }, { "hyperbolic", { "hyperbolic", benchmark_hyperbolic } }, { "power", { "power", benchmark_power } }, { "basic_math", { "basic math", benchmark_basic_math } }, { "rounding", { "rounding", benchmark_rounding } }, #ifdef XSIMD_POLY_BENCHMARKS { "utils", { "polynomial evaluation", benchmark_poly_evaluation } }, #endif }; if (argc > 1) { if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") { std::cout << "Available options:" << std::endl; for (auto const& kv : fn_map) { std::cout << kv.first << ": run benchmark on " << kv.second.first << " functions" << std::endl; } } else { std::cout << "############################" << std::endl << "# " << xsimd::default_arch::name() << std::endl << "############################" << std::endl; for (int i = 
1; i < argc; ++i) { fn_map.at(argv[i]).second(); } } } else { std::cout << "############################" << std::endl << "# " << xsimd::default_arch::name() << std::endl << "############################" << std::endl; for (auto const& kv : fn_map) { kv.second.second(); } } return 0; } xtensor-stack-xsimd-541558d/benchmark/xsimd_benchmark.hpp000066400000000000000000000557531517435117100235470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_BENCHMARK_HPP #define XSIMD_BENCHMARK_HPP #include "xsimd/arch/xsimd_scalar.hpp" #include "xsimd/xsimd.hpp" #include #include #include namespace xsimd { using duration_type = std::chrono::duration; template using bench_vector = std::vector>; template void init_benchmark(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); rhs[i] = T(10.2) / T(i + 2) + T(0.25); } } template void init_benchmark(bench_vector& op0, bench_vector& op1, bench_vector& op2, bench_vector& res, size_t size) { op0.resize(size); op1.resize(size); op2.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); op1[i] = T(10.2) / T(i + 3) + T(0.25); op2[i] = T(20.1) / T(i + 2) + T(0.65); } } template void init_benchmark_arctrigo(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(-1.) 
+ T(2.) * T(i) / T(size); rhs[i] = T(i) / T(i + 2) + T(0.25); } } enum class init_method { classic, arctrigo }; template duration_type benchmark_scalar(F f, V& lhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& lhs, V& rhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i], rhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(op0[i], op1[i], op2[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs = B::load_aligned(&lhs[i]); B bres = f(blhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs = B::load_aligned(&lhs[i]), blhs2 = B::load_aligned(&lhs[j]), blhs3 = B::load_aligned(&lhs[k]), blhs4 = B::load_aligned(&lhs[l]); B bres = f(blhs); B bres2 = f(blhs2); B bres3 = f(blhs3); B bres4 = f(blhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs = B::load_aligned(&lhs[i]), brhs = B::load_aligned(&rhs[i]); B bres = f(blhs, brhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs = B::load_aligned(&lhs[i]), brhs = B::load_aligned(&rhs[i]), blhs2 = B::load_aligned(&lhs[j]), brhs2 = B::load_aligned(&rhs[j]); B blhs3 = B::load_aligned(&lhs[k]), brhs3 = B::load_aligned(&rhs[k]), blhs4 = B::load_aligned(&lhs[l]), brhs4 = B::load_aligned(&rhs[l]); B bres = f(blhs, brhs); B bres2 = f(blhs2, brhs2); B bres3 = f(blhs3, brhs3); B bres4 = f(blhs4, brhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B bop0 = B::load_aligned(&op0[i]), bop1 = B::load_aligned(&op1[i]), bop2 = B::load_aligned(&op2[i]); B bres = f(bop0, bop1, bop2); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B bop0_i = B::load_aligned(&op0[i]), bop1_i = B::load_aligned(&op1[i]), bop2_i = B::load_aligned(&op2[i]); B bop0_j = B::load_aligned(&op0[j]), bop1_j = B::load_aligned(&op1[j]), bop2_j = B::load_aligned(&op2[j]); B bop0_k = B::load_aligned(&op0[k]), bop1_k = B::load_aligned(&op1[k]), bop2_k = B::load_aligned(&op2[k]); B bop0_l = B::load_aligned(&op0[l]), bop1_l = B::load_aligned(&op1[l]), bop2_l = B::load_aligned(&op2[l]); B bres_i = f(bop0_i, bop1_i, bop2_i); B bres_j = f(bop0_j, bop1_j, bop2_j); B bres_k = f(bop0_k, bop1_k, bop2_k); B bres_l = f(bop0_l, bop1_l, bop2_l); bres_i.store_aligned(&res[i]); bres_j.store_aligned(&res[j]); bres_k.store_aligned(&res[k]); bres_l.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; switch (init) { case init_method::classic: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; case init_method::arctrigo: init_benchmark_arctrigo(f_lhs, f_rhs, f_res, size); init_benchmark_arctrigo(d_lhs, d_rhs, d_res, size); break; default: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; } #ifndef XSIMD_POLY_BENCHMARKS duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_res, iter); #endif duration_type t_float_vector = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; #endif out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #endif #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_2op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, 
d_rhs, d_res; init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_vector = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_rhs, d_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_3op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_op0, f_op1, f_op2, f_res; bench_vector d_op0, d_op1, d_op2, d_res; init_benchmark(f_op0, f_op1, f_op2, f_res, size); init_benchmark(d_op0, d_op1, d_op2, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_vector = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_op0, d_op1, d_op2, d_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, 
d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } #define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \ struct NAME##_fn \ { \ template \ inline T operator()(const T& lhs, const T& rhs) const \ { \ return lhs OP rhs; \ } \ inline std::string name() const \ { \ return #NAME; \ } \ } #define DEFINE_FUNCTOR_1OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& x) const \ { \ using xsimd::FN; \ return FN(x); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } #define DEFINE_FUNCTOR_1OP_TEMPLATE(NAME, FN, N, ...) 
\ struct NAME##_##N##_fn \ { \ template \ inline T operator()(const T& x) const \ { \ using xsimd::FN; \ return FN(x); \ } \ inline std::string name() const \ { \ return #FN " " #N; \ } \ } #define DEFINE_FUNCTOR_2OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& lhs, const T& rhs) const \ { \ using xsimd::FN; \ return FN(lhs, rhs); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } #define DEFINE_FUNCTOR_3OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& op0, const T& op1, const T& op2) const \ { \ using xsimd::FN; \ return FN(op0, op1, op2); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } DEFINE_OP_FUNCTOR_2OP(+, add); DEFINE_OP_FUNCTOR_2OP(-, sub); DEFINE_OP_FUNCTOR_2OP(*, mul); DEFINE_OP_FUNCTOR_2OP(/, div); DEFINE_FUNCTOR_1OP(exp); DEFINE_FUNCTOR_1OP(exp2); DEFINE_FUNCTOR_1OP(expm1); DEFINE_FUNCTOR_1OP(log); DEFINE_FUNCTOR_1OP(log10); DEFINE_FUNCTOR_1OP(log2); DEFINE_FUNCTOR_1OP(log1p); DEFINE_FUNCTOR_1OP(sin); DEFINE_FUNCTOR_1OP(cos); DEFINE_FUNCTOR_1OP(tan); DEFINE_FUNCTOR_1OP(asin); DEFINE_FUNCTOR_1OP(acos); DEFINE_FUNCTOR_1OP(atan); DEFINE_FUNCTOR_1OP(sinh); DEFINE_FUNCTOR_1OP(cosh); DEFINE_FUNCTOR_1OP(tanh); DEFINE_FUNCTOR_1OP(asinh); DEFINE_FUNCTOR_1OP(acosh); DEFINE_FUNCTOR_1OP(atanh); DEFINE_FUNCTOR_2OP(pow); DEFINE_FUNCTOR_1OP(sqrt); DEFINE_FUNCTOR_1OP(cbrt); DEFINE_FUNCTOR_2OP(hypot); DEFINE_FUNCTOR_1OP(ceil); DEFINE_FUNCTOR_1OP(floor); DEFINE_FUNCTOR_1OP(trunc); DEFINE_FUNCTOR_1OP(round); DEFINE_FUNCTOR_1OP(nearbyint); DEFINE_FUNCTOR_1OP(rint); DEFINE_FUNCTOR_2OP(fmod); DEFINE_FUNCTOR_2OP(remainder); DEFINE_FUNCTOR_2OP(fdim); DEFINE_FUNCTOR_3OP(clip); #if 0 DEFINE_FUNCTOR_1OP(isfinite); DEFINE_FUNCTOR_1OP(isinf); DEFINE_FUNCTOR_1OP(is_flint); DEFINE_FUNCTOR_1OP(is_odd); DEFINE_FUNCTOR_1OP(is_even); #endif DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, 
kernel::horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); } #endif xtensor-stack-xsimd-541558d/docs/000077500000000000000000000000001517435117100166575ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/docs/Doxyfile000066400000000000000000000047311517435117100203720ustar00rootroot00000000000000PROJECT_NAME = "xsimd" XML_OUTPUT = xml INPUT = ../include/xsimd/types/xsimd_api.hpp \ ../include/xsimd/types/xsimd_batch.hpp \ ../include/xsimd/types/xsimd_batch_constant.hpp \ ../include/xsimd/config/xsimd_arch.hpp \ ../include/xsimd/config/xsimd_config.hpp \ ../include/xsimd/memory/xsimd_alignment.hpp \ ../include/xsimd/memory/xsimd_aligned_allocator.hpp \ ../include/xsimd/types/xsimd_common_arch.hpp \ ../include/xsimd/types/xsimd_traits.hpp \ ../include/xsimd/types/xsimd_vsx_register.hpp \ ../include/xsimd/types/xsimd_avx2_register.hpp \ ../include/xsimd/types/xsimd_avx512bw_register.hpp \ ../include/xsimd/types/xsimd_avx512cd_register.hpp \ ../include/xsimd/types/xsimd_avx512dq_register.hpp \ ../include/xsimd/types/xsimd_avx512f_register.hpp \ ../include/xsimd/types/xsimd_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx2_register.hpp \ ../include/xsimd/types/xsimd_fma3_sse_register.hpp \ 
../include/xsimd/types/xsimd_fma4_register.hpp \ ../include/xsimd/types/xsimd_neon64_register.hpp \ ../include/xsimd/types/xsimd_neon_register.hpp \ ../include/xsimd/types/xsimd_rvv_register.hpp \ ../include/xsimd/types/xsimd_sse2_register.hpp \ ../include/xsimd/types/xsimd_sse3_register.hpp \ ../include/xsimd/types/xsimd_sse4_1_register.hpp \ ../include/xsimd/types/xsimd_sse4_2_register.hpp \ ../include/xsimd/types/xsimd_ssse3_register.hpp \ ../include/xsimd/types/xsimd_sve_register.hpp GENERATE_LATEX = NO GENERATE_MAN = NO GENERATE_RTF = NO CASE_SENSE_NAMES = NO GENERATE_HTML = NO GENERATE_XML = YES RECURSIVE = YES QUIET = YES JAVADOC_AUTOBRIEF = YES WARN_IF_UNDOCUMENTED = NO WARN_AS_ERROR = NO ENABLE_PREPROCESSING = YES MACRO_EXPANSION = YES EXPAND_ONLY_PREDEF = YES PREDEFINED = XSIMD_NO_DISCARD= \ XSIMD_INLINE=inline \ DOXYGEN_SHOULD_SKIP_THIS= xtensor-stack-xsimd-541558d/docs/Makefile000066400000000000000000000147261517435117100203310ustar00rootroot00000000000000# You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. 
PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext api default: html help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: doxygen $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html." dirhtml: doxygen $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: doxygen $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." pickle: doxygen $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: doxygen $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: doxygen $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." epub: doxygen $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: doxygen $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: doxygen $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. 
The manual pages are in $(BUILDDIR)/man." texinfo: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: doxygen $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: doxygen $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: doxygen $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: doxygen $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: doxygen $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." xml: doxygen $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: doxygen $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
xtensor-stack-xsimd-541558d/docs/environment.yml000066400000000000000000000001531517435117100217450ustar00rootroot00000000000000name: xsimd-docs channels: - conda-forge dependencies: - breathe - sphinx_rtd_theme - sphinx=6.* xtensor-stack-xsimd-541558d/docs/make.bat000066400000000000000000000161651517435117100202750ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. 
coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 1>NUL 2>NUL if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( doxygen %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. 
goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\packagename.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\packagename.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. 
echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
goto end ) :end xtensor-stack-xsimd-541558d/docs/source/000077500000000000000000000000001517435117100201575ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/docs/source/_static/000077500000000000000000000000001517435117100216055ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/docs/source/_static/main_stylesheet.css000066400000000000000000000000741517435117100255150ustar00rootroot00000000000000.wy-nav-content{ max-width: 1000px; margin: auto; } xtensor-stack-xsimd-541558d/docs/source/api/000077500000000000000000000000001517435117100207305ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/docs/source/api/aligned_allocator.rst000066400000000000000000000007201517435117100251240ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Alignment Manipulation ====================== Aligned Memory Allocator ------------------------ .. doxygenclass:: xsimd::aligned_allocator :project: xsimd :members: Alignement Checker ------------------ .. doxygenfunction:: xsimd::is_aligned :project: xsimd xtensor-stack-xsimd-541558d/docs/source/api/arch.rst000066400000000000000000000017151517435117100224030ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Architecture Manipulation ========================= xsimd provides an high level description of the instruction sets it manipulates. The mentioned types are primarily used as template parameters for :ref:`batch `, and when interacting with :cpp:func:`xsimd::dispatch()`. The best available architecture is available at compile time through ``xsimd::best_arch`` which also happens to be ``xsimd::default_arch``. .. 
doxygengroup:: architectures :project: xsimd :members: Emulated Mode ------------- When compiled with the macro ``XSIMD_WITH_EMULATED`` set to ``1``, xsimd also exhibits a specific architecture ``xsimd::emulated``, which consists of a vector of ``N`` bits emulated using scalar mode. It is mostly available for testing and debugging. xtensor-stack-xsimd-541558d/docs/source/api/arithmetic_index.rst000066400000000000000000000116741517435117100250130ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Arithmetic Operations: Arithmetic Operations ===================== Binary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`add` | per slot addition | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sub` | per slot subtraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mul` | per slot multiply | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`div` | per slot division | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mod` | per slot modulo | +---------------------------------------+----------------------------------------------------+ Unary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`neg` | per slot negate | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`pos` | per slot positive | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reciprocal` | per slot reciprocal | 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`decr` | per slot decrement | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`decr_if` | per slot decrement, based on a mask | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`incr` | per slot increment | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`incr_if` | per slot increment, based on a mask | +---------------------------------------+----------------------------------------------------+ Saturated arithmetic: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sadd` | per slot saturated addition | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ssub` | per slot saturated subtraction | +---------------------------------------+----------------------------------------------------+ Fused operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fma` | fused multiply add | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fms` | fused multiply sub | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fnma` | fused negate multiply add | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fnms` | fused negate multiply sub | +---------------------------------------+----------------------------------------------------+ Average computation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`avg` | per slot average | 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`avgr` | per slot rounded average | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_arithmetic :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/batch_index.rst000066400000000000000000000005341517435117100237340ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. _Batch Types: Batch Types =========== .. toctree:: :maxdepth: 1 xsimd_batch xsimd_batch_bool xsimd_batch_complex xsimd_batch_constant xtensor-stack-xsimd-541558d/docs/source/api/batch_manip.rst000066400000000000000000000012771517435117100237360ustar00rootroot00000000000000.. Copyright (c) 2021, Serge Guelton Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Conditional Expression ====================== +------------------------------+-------------------------------------------+ | :cpp:func:`select` | conditional selection with mask | +------------------------------+-------------------------------------------+ ---- .. doxygengroup:: batch_cond :project: xsimd :content-only: In the specific case when one needs to conditionnaly increment or decrement a batch based on a mask, :cpp:func:`incr_if` and :cpp:func:`decr_if` provide specialized version. xtensor-stack-xsimd-541558d/docs/source/api/bitwise_operators_index.rst000066400000000000000000000046051517435117100264220ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. 
raw:: html Bitwise Operators ================= +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_not` | per slot bitwise not | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_or` | per slot bitwise or | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_xor` | per slot bitwise xor | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_and` | per slot bitwise and | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_andnot` | per slot bitwise and not | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_lshift` | per slot bitwise left shift | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_rshift` | per slot bitwise right shift | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotr` | per slot rotate right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotl` | per slot rotate left | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_bitwise :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/cast_index.rst000077500000000000000000000036761517435117100236220ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. 
raw:: html Type Conversion =============== Cast: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`batch_cast` | ``static_cast`` on batch types | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_cast` | ``reinterpret_cast`` on batch types | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`batch_bool_cast` | ``static_cast`` on batch predicate types | +---------------------------------------+----------------------------------------------------+ Conversion: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_float` | per slot conversion to floating point | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_int` | per slot conversion to integer | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`widen` | per slot conversion to twice as big type | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_conversion :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/comparison_index.rst000066400000000000000000000062741517435117100250340ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. 
raw:: html Comparison Operators ==================== Ordering: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`eq` | per slot equals to comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`neq` | per slot different from comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`gt` | per slot strictly greater than comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`lt` | per slot strictly lower than comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ge` | per slot greater or equal to comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`le` | per slot lower or equal to comparison | +---------------------------------------+----------------------------------------------------+ Parity check: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_even` | per slot check for evenness | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_odd` | per slot check for oddness | +---------------------------------------+----------------------------------------------------+ Floating point number check: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isinf` | per slot check for infinity | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isnan` | per slot check for NaN | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isfinite` | per slot check for finite number | 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_flint` | per slot check for float representing an integer | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_logical :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/data_transfer.rst000066400000000000000000000116631517435117100243060ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. _Data Transfer: Data Transfers ============== From memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load` | load values from memory (optionally masked) | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_unaligned` | load values from unaligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_as` | load values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ From a scalar: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`broadcast` | broadcasting a value to all slots | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`broadcast_as` | broadcasting a value, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ To memory: 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`store` | store values to memory (optionally masked) | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_unaligned` | store values to unaligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_as` | store values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ In place: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`swizzle` | rearrange slots within the batch | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`slide_left` | bitwise shift the whole batch to the left | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`slide_right` | bitwise shift the whole batch to the right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotate_left` | bitwise rotate the whole batch to the left | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotate_right` | bitwise rotate the whole batch to the right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`insert` | modify a single batch slot | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`compress` | pack elements according to a mask | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`expand` | 
select contiguous elements from the batch | +---------------------------------------+----------------------------------------------------+ Between batches: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`transpose` | tranpose a matrix as an array of batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_lo` | interleave low halves of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_hi` | interleave high halves of two batches | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_data_transfer :project: xsimd :content-only: The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::aligned_mode :project: xsimd .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd xtensor-stack-xsimd-541558d/docs/source/api/dispatching.rst000066400000000000000000000036461517435117100237700ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Arch Dispatching: Arch Dispatching ================ `xsimd` provides a generic way to dispatch a function call based on the architecture the code was compiled for and the architectures available at runtime. The :cpp:func:`xsimd::dispatch` function takes a functor whose call operator takes an architecture parameter as first operand, followed by any number of arguments ``Args...`` and turn it into a dispatching functor that takes ``Args...`` as arguments. .. doxygenfunction:: xsimd::dispatch :project: xsimd Following code showcases a usage of the :cpp:func:`xsimd::dispatch` function: .. 
code-block:: c++ #include "sum.hpp" // Create the dispatching function, specifying the architecture we want to // target. auto dispatched = xsimd::dispatch>(sum{}); // Call the appropriate implementation based on runtime information. float res = dispatched(data, 17); This code does *not* require any architecture-specific flags. The architecture specific details follow. The ``sum.hpp`` header contains the function being actually called, in an architecture-agnostic description: .. literalinclude:: ../../../test/doc/sum.hpp The SSE2 and AVX2 version needs to be provided in other compilation units, compiled with the appropriate flags, for instance: .. literalinclude:: ../../../test/doc/sum_avx2.cpp .. literalinclude:: ../../../test/doc/sum_sse2.cpp xtensor-stack-xsimd-541558d/docs/source/api/instr_macros.rst000066400000000000000000000021371517435117100241700ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Instruction Set Macros ====================== Each of these macros corresponds to an instruction set supported by XSIMD. They can be used to filter arch-specific code. .. doxygengroup:: xsimd_config_macro :project: xsimd :content-only: Changing Default Architecture ***************************** You can change the default instruction set used by xsimd (when none is provided explicitely) by setting the ``XSIMD_DEFAULT_ARCH`` macro to, say, ``xsimd::avx2``. A common usage is to set it to ``xsimd::unsupported`` as a way to detect instantiation of batches with the default architecture. xtensor-stack-xsimd-541558d/docs/source/api/math_index.rst000066400000000000000000000245771517435117100236210ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. 
The full license is in the file LICENSE, distributed with this software. .. raw:: html Mathematical Functions ====================== Basic functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`abs` | absolute value | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fabs` | absolute value of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmod` | remainder of the floating point division operation | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`remainder` | signed remainder of the division operation | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`min` | smaller of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`max` | larger of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmin` | smaller of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmax` | larger of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fdim` | positive difference | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`clip` | clipping operation | +---------------------------------------+----------------------------------------------------+ Exponential functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp` | natural exponential function | 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp2` | base 2 exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp10` | base 10 exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`expm1` | natural exponential function, minus one | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log` | natural logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log2` | base 2 logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log10` | base 10 logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log1p` | natural logarithm of one plus function | +---------------------------------------+----------------------------------------------------+ Power functions: +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`pow` | power function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`rsqrt` | reciprocal square root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`sqrt` | square root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`cbrt` | cubic root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`hypot` | hypotenuse function | +-----------------------------------------+----------------------------------------------------+ Trigonometric functions: 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`sin` | sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`cos` | cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sincos` | sine and cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tan` | tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`asin` | arc sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`acos` | arc cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atan` | arc tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atan2` | arc tangent function, determining quadrants | +---------------------------------------+----------------------------------------------------+ Hyperbolic functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sinh` | hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`cosh` | hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tanh` | hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`asinh` | inverse hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`acosh` | inverse hyperbolic cosine function | 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`atanh` | inverse hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ Error functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`erf` | error function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`erfc` | complementary error function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tgamma` | gamma function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`lgamma` | natural logarithm of the gamma function | +---------------------------------------+----------------------------------------------------+ Nearint operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ceil` | nearest integers not less | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`floor` | nearest integers not greater | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`trunc` | nearest integers not greater in magnitude | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`round` | nearest integers, rounding away from zero | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`nearbyint` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rint` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ 
---- .. doxygengroup:: batch_math :project: xsimd :content-only: .. doxygengroup:: batch_trigo :project: xsimd :content-only: .. doxygengroup:: batch_rounding :project: xsimd :content-only: .. doxygengroup:: batch_math_extra :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/misc_index.rst000066400000000000000000000034221517435117100236050ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Miscellaneous ============= Sign manipulation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sign` | per slot sign extraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`signnz` | per slot sign extraction on non null elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitofsign` | per slot sign bit extraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`copysign` | per slot sign copy | +---------------------------------------+----------------------------------------------------+ Stream operation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`operator<<` | batch pretty-printing | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_miscellaneous :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/reducer_index.rst000066400000000000000000000036141517435117100243060ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. 
raw:: html Reduction Operators =================== +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce` | generic batch reduction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_add` | sum of each batch element | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_max` | max of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_min` | min of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_mul` | product of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`haddp` | horizontal sum across batches | +---------------------------------------+----------------------------------------------------+ Also see the `batch_bool` :ref:`xsimd-batch-bool-reducers`. ---- .. doxygengroup:: batch_reducers :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/type_traits.rst000066400000000000000000000042401517435117100240310ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Type Traits: Type Traits =========== `xsimd` provides a few type traits to interact with scalar and batch types in an uniform manner. 
Combined traits: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`batch_traits` | batch types and properties | +---------------------------------------+----------------------------------------------------+ Type check: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch` | batch type detection | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch_bool` | mask batch type detection | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch_complex` | complex batch type detection | +---------------------------------------+----------------------------------------------------+ Type access: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`scalar_type` | batch element type | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`mask_type` | batch mask type | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_traits :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/xsimd_batch.rst000066400000000000000000000006061517435117100237510ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of Scalars ================ .. _xsimd-batch-ref: .. doxygenclass:: xsimd::batch :project: xsimd :members: .. doxygenstruct:: xsimd::make_sized_batch :project: xsimd :members: xtensor-stack-xsimd-541558d/docs/source/api/xsimd_batch_bool.rst000066400000000000000000000010711517435117100247610ustar00rootroot00000000000000.. 
Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of Conditions =================== .. _xsimd-batch-bool-ref: .. doxygenclass:: xsimd::batch_bool :project: xsimd :members: Logical Operators ----------------- .. doxygengroup:: batch_bool_logical :project: xsimd :content-only: .. _xsimd-batch-bool-reducers: Reducers -------- .. doxygengroup:: batch_bool_reducers :project: xsimd :content-only: xtensor-stack-xsimd-541558d/docs/source/api/xsimd_batch_complex.rst000066400000000000000000000015041517435117100254760ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of Complex Numbers ======================== .. doxygenclass:: xsimd::batch< std::complex< T >, A > :project: xsimd :members: Operations Specific to Batches of Complex Numbers ------------------------------------------------- .. doxygengroup:: batch_complex :project: xsimd :content-only: XTL Complex Support ------------------- If the preprocessor token ``XSIMD_ENABLE_XTL_COMPLEX`` is defined, ``xsimd`` provides constructors of ``xsimd::batch< std::complex< T >, A >`` from ``xtl::xcomplex``, similar to those for ``std::complex``. This requires `XTL`_ to be installed. .. _XTL: https://github.com/xtensor-stack/xtl xtensor-stack-xsimd-541558d/docs/source/api/xsimd_batch_constant.rst000066400000000000000000000014101517435117100256540ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of Constants ================== .. _xsimd-batch-constant-ref: .. doxygenstruct:: xsimd::batch_constant :project: xsimd :members: .. 
doxygenstruct:: xsimd::batch_bool_constant :project: xsimd :members: .. doxygenfunction:: xsimd::make_batch_constant :project: xsimd .. doxygenfunction:: xsimd::make_batch_bool_constant :project: xsimd .. note:: :cpp:func:`make_batch_constant` and :cpp:func:`make_batch_bool_constant` also accept a scalar value instead of a generator. In that case, that value is broadcast to each slot of the constant batch. xtensor-stack-xsimd-541558d/docs/source/basic_usage.rst000066400000000000000000000043001517435117100231530ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Basic Usage =========== Manipulating Abstract Batches ----------------------------- Here is an example that computes the mean of two batches, using the best architecture available, based on compile-time information: .. literalinclude:: ../../test/doc/manipulating_abstract_batches.cpp There is no explicit architectural information available in the code; it is deduced from the compiler target and its vector instruction support. If several vector instruction sets are supported, the one with the widest register width and most operations is picked (e.g. AVX2 over AVX over SSE4.1). There is no explicit register size information available in the code; it depends solely on the architecture picked, as stated above. The batch can be a batch of 4 single precision floating point numbers (e.g. on Neon) or a batch of 8 (e.g. on AVX2). Manipulating Parametric Batches ------------------------------- The implicit architectural information from the previous example can be made explicit, and the type used can be parametric. This is achieved as described in the following example: .. 
literalinclude:: ../../test/doc/manipulating_parametric_batches.cpp At its core, a :cpp:class:`xsimd::batch` is bound to the scalar type it contains, and to the instruction set it can use to operate on its values. Explicit Use of an Instruction Set Extension -------------------------------------------- Here is an example that loads two batches of 4 double-precision floating point values, and computes their mean, explicitly using the AVX extension: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set.cpp Note that in that case, the instruction set is explicitly specified in the batch type. The flags passed down to the compiler need to make it possible for this architecture to be used. This example outputs: .. code:: (2.0, 3.0, 4.0, 5.0) .. warning:: If you allow your compiler to generate AVX2 instructions (e.g. through ``-mavx2``) there is nothing preventing it from optimizing the above code using AVX2 instructions. xtensor-stack-xsimd-541558d/docs/source/cmake.svg000066400000000000000000000425311517435117100217650ustar00rootroot00000000000000 image/svg+xml xtensor-stack-xsimd-541558d/docs/source/conda.svg000066400000000000000000000034151517435117100217670ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/docs/source/conf.py000066400000000000000000000015321517435117100214570ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import subprocess on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if on_rtd: subprocess.check_call('cd ..; doxygen', shell=True) import sphinx_rtd_theme html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] def setup(app): app.add_css_file("main_stylesheet.css") extensions = ['breathe', 'sphinx_rtd_theme'] breathe_projects = { 'xsimd': '../xml' } templates_path = ['_templates'] html_static_path = ['_static'] source_suffix = '.rst' master_doc = 'index' project = 'xsimd' copyright = '2016, Johan Mabille and Sylvain Corlay' author = 'Johan Mabille and Sylvain 
Corlay' html_logo = 'quantstack-white.svg' exclude_patterns = [] highlight_language = 'c++' pygments_style = 'sphinx' todo_include_todos = False htmlhelp_basename = 'xsimddoc' xtensor-stack-xsimd-541558d/docs/source/index.rst000066400000000000000000000115021517435117100220170ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. image:: xsimd.svg :alt: xsimd C++ wrappers for SIMD intrinsics. Introduction ------------ `SIMD`_ (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. However, these instructions differ between microprocessor vendors and compilers. `xsimd` provides a unified means for using these features for library authors. The core of the library consists of a parametrized vector type, :ref:`Batch Types`, and a set of operations to perform :ref:`Arithmetic Operations`, :ref:`Data Transfer`, and many common mathematical functions, as for single values. There are several ways to use `xsimd` using those :ref:`Batch Types` and operations: - one can write a generic, vectorized, algorithm and compile it as part of their application build, with the right architecture flag; - one can write a generic, vectorized, algorithm and compile several versions of it by just changing the architecture flags, then pick the best version at runtime; - one can write a vectorized algorithm specialized for a given architecture and still benefit from the high-level abstraction proposed by `xsimd`. Of course, nothing prevents the combination of several of those approaches, but more about this in section :ref:`Writing vectorized code`. 
You can find out more about this implementation of C++ wrappers for SIMD intrinsics at the `The C++ Scientist`_. The mathematical functions are a lightweight implementation of the algorithms also used in `boost.SIMD`_. Compiler and Architecture Support --------------------------------- The following SIMD instruction set extensions are supported: +--------------+---------------------------------------------------------+ | Architecture | Instruction set extensions | +==============+=========================================================+ | x86 | SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, FMA3, AVX2 | +--------------+---------------------------------------------------------+ | x86 | AVX512 (gcc7 and higher) | +--------------+---------------------------------------------------------+ | x86 AMD | same as above + FMA4 | +--------------+---------------------------------------------------------+ | ARM | ARMv7, ARMv8 | +--------------+---------------------------------------------------------+ | WebAssembly | WASM | +--------------+---------------------------------------------------------+ | Risc-V | Vector ISA | +--------------+---------------------------------------------------------+ | PowerPC | VSX | +--------------+---------------------------------------------------------+ `xsimd` requires a C++14 compliant compiler. The following C++ compilers are supported: +-------------------------+-------------------------------+ | Compiler | Version | +=========================+===============================+ | Microsoft Visual Studio | MSVC 2015 update 2 and above | +-------------------------+-------------------------------+ | g++ | 4.9 and above | +-------------------------+-------------------------------+ | clang | 3.7 and above | +-------------------------+-------------------------------+ Licensing --------- We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. 
This software is licensed under the BSD-3-Clause license. See the LICENSE file for details. .. toctree:: :caption: INSTALLATION :maxdepth: 2 installation .. toctree:: :caption: USAGE :maxdepth: 2 basic_usage vectorized_code integration .. toctree:: :caption: API REFERENCE :maxdepth: 1 api/instr_macros api/batch_index api/data_transfer api/arithmetic_index api/comparison_index api/bitwise_operators_index api/math_index api/reducer_index api/cast_index api/type_traits api/batch_manip api/misc_index api/aligned_allocator api/arch api/dispatching .. toctree:: :caption: MIGRATION GUIDE :maxdepth: 1 migration_guide .. _SIMD: https://fr.wikipedia.org/wiki/Single_instruction_multiple_data .. _The C++ Scientist: http://johanmabille.github.io/blog/archives/ .. _boost.SIMD: https://github.com/NumScale/boost.simd xtensor-stack-xsimd-541558d/docs/source/installation.rst000066400000000000000000000034341517435117100234160ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Installation ============ `xsimd` is a header-only library, so installing it is just a matter of copying the ``include/xsimd`` directory. However we provide standardized means to install it, with package managers or with cmake. Besides the `xsimd` headers, all these methods place the ``cmake`` project configuration file in the right location so that third-party projects can use cmake's ``find_package`` to locate `xsimd` headers. .. image:: conda.svg Using the conda-forge Package ----------------------------- A package for `xsimd` is available for the `mamba `_ (or `conda `_) package manager. .. code:: mamba install -c conda-forge xsimd .. image:: spack.svg Using the Spack Package ----------------------- A package for `xsimd` is available on the `Spack `_ package manager. .. code:: spack install xsimd spack load xsimd .. 
image:: cmake.svg From Source with cmake ---------------------- You can install `xsimd` from source with `cmake `_. On Unix platforms, from the source directory: .. code:: mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. make install On Windows platforms, from the source directory: .. code:: mkdir build cd build cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. nmake nmake install xtensor-stack-xsimd-541558d/docs/source/integration.rst000066400000000000000000000030001517435117100232250ustar00rootroot00000000000000.. Copyright (c) 2025, Serge Guelton Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Integration =========== When Targeting a Single Architecture ------------------------------------ If you compile your whole project for a single architecture, you can rely on the implicit architecture parameter for :cpp:class:`xsimd::batch`. Just add your source using `xsimd` to your project build system, pass down the appropriate flags and the magic should happen. It's very common though to have a base application with minimal architectural constraints, while still wanting to benefit from the acceleration of better instruction sets if those are available. When Targeting Multiple Architectures ------------------------------------- It's very common, especially when targeting Intel hardware, to set a minimal baseline, say SSE2, for the base application, while still shipping computation kernels specialized for SSE4.2, AVX2 or AVX512BF. In that case one can write specific kernels for each targeted instruction set (or a generic one that's instantiated for each targeted instruction set). Those kernels must then be compiled with the appropriate flags independently, and linked into the application. 
`xsimd` provides a generic dispatch mechanism that can be used from the *base application* to pick the best kernel *at runtime* based on runtime detection of the supported architectures, as described in more detail in :ref:`Arch Dispatching`. xtensor-stack-xsimd-541558d/docs/source/migration_guide.rst000066400000000000000000000056031517435117100240630ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html From 7.x to 8.x =============== Version 8.x introduces a lot of API differences compared to version 7.x. This section motivates the version bump and details the most notable changes. Why 8.x ------- Version 8.x introduces a new concept in `xsimd`: all batch types are now parametrized by a type, say ``double``, and an optional architecture, say ``avx512``, as in ``batch<double, avx512>``. It is still possible to just require a batch of doubles and let the library pick the most appropriate architecture, as in ``batch<double>``. This new design makes it possible to target multiple architectures from the same code, as detailed in the :ref:`Arch Dispatching` section. As a side effect of this (almost full) rewrite of the library code, `xsimd` is now twice as fast to compile, and its source code size has been (roughly) divided by two. The `xsimd` developers also took this as an opportunity to significantly improve test coverage. Most Notable Changes -------------------- Batch Types *********** The second argument of :cpp:class:`xsimd::batch` is now a type that represents an architecture, instead of an integer. The previous behavior can be emulated through the :cpp:class:`xsimd::make_sized_batch` utility. Batch of Complex Types ********************** Loading a batch of complex from an ``xtl::xcomplex`` now yields an ``xsimd::batch<std::complex<T>>`` instead of an ``xtl::xcomplex``. 
It is still possible to store an ``xsimd::batch<std::complex<T>>`` to an ``xtl::xcomplex``. Loading Batches *************** ``xsimd::batch::load*`` are now static functions. It is no longer supported to update an existing batch through its ``load`` method. The regular assignment operator can be used instead. Indexing Batches **************** ``xsimd::batch::operator[](size_t)`` has been replaced with ``xsimd::batch::get(size_t)``. Keep in mind that this method implies a register load *for each call*, so it's wise not to use it in performance-critical sections. When needed, do an explicit store of the batch into an array and work from there. Architecture Detection ********************** Many macros have been replaced by more elaborate constructs. ``XSIMD_INSTR_SET_AVAILABLE`` has been replaced by the type alias ``xsimd::default_arch``. Likewise architecture-specific macros like ``XSIMD_X86_INSTR_SET_AVAILABLE`` have been replaced by ``xsimd::supported_architectures::contains()``. Macros like ``XSIMD_WITH_SSE3`` are still defined to ``0`` or ``1`` to guard architecture-specific code. xtensor-stack-xsimd-541558d/docs/source/quantstack-white.svg000066400000000000000000000116361517435117100242030ustar00rootroot00000000000000 image/svg+xmlxtensor-stack-xsimd-541558d/docs/source/spack.svg000066400000000000000000000046711517435117100220110ustar00rootroot00000000000000 xtensor-stack-xsimd-541558d/docs/source/vectorized_code.rst000066400000000000000000000101631517435117100240620ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. _Writing Vectorized Code: Writing Vectorized Code ======================= Assume that we have a simple function that computes the mean of two vectors, something like: .. literalinclude:: ../../test/doc/writing_vectorized_code.cpp How can we use `xsimd` to take advantage of vectorization? 
Explicit Use of an Instruction Set ---------------------------------- `xsimd` provides the template class :cpp:class:`xsimd::batch` parametrized by ``T`` and ``A`` types where ``T`` is the type of the values involved in SIMD instructions and ``A`` is the target architecture. If you know which instruction set is available on your machine, you can directly use the corresponding specialization of ``batch``. For instance, assuming the AVX instruction set is available, the previous code can be vectorized the following way: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean.cpp Note that the code is written in a form that's independent from the actual vector register width. However, if you want to write code that is portable, you cannot rely on the use of an architecture-specific ``batch``. Indeed this won't compile on a CPU where only the SSE2 instruction set is available for instance. Fortunately, if you don't set the second template parameter, `xsimd` picks the best architecture among the ones available, based on the compiler flags you use. Aligned vs Unaligned Memory --------------------------- In the previous example, you may have noticed the :cpp:func:`xsimd::batch::load_unaligned` and :cpp:func:`xsimd::batch::store_unaligned` functions. These are meant for loading values from contiguous dynamically allocated memory into SIMD registers and vice versa. When dealing with memory transfer operations, some instruction sets require the memory to be aligned by a given amount, others can handle both aligned and unaligned modes. In the latter case, operating on aligned memory is generally faster than operating on unaligned memory. `xsimd` provides an aligned memory allocator, namely :cpp:class:`xsimd::aligned_allocator` which follows the standard requirements, so it can be used with STL containers. Let's change the previous code so it can take advantage of this allocator: .. 
literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp Memory Alignment and Tag Dispatching ------------------------------------ You may need to write code that can operate on any type of vectors or arrays, not only the STL ones. In that case, you cannot make assumptions about the memory alignment of the container. `xsimd` provides a tag dispatching mechanism that allows you to easily write such generic code: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp Here, the ``Tag`` template parameter can be :cpp:class:`xsimd::aligned_mode` or :cpp:class:`xsimd::unaligned_mode`. Assuming the existence of a ``get_alignment_tag`` meta-function in the code, the previous code can be invoked this way: .. code:: mean(a, b, res, get_alignment_tag()); Writing Arch-Independent Code ----------------------------- If your code may target either the SSE2, AVX2 or AVX512 instruction set, `xsimd` makes it possible to make your code even more generic by using the architecture as a template parameter: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp Then you just need to ``#include`` that file, force instantiation for a specific architecture and pass the appropriate flag to the compiler. For instance: .. literalinclude:: ../../test/doc/sum_sse2.cpp This can be useful to implement runtime dispatching, based on the instruction set detected at runtime. `xsimd` provides a generic mechanism :cpp:func:`xsimd::dispatch()` to implement this pattern. Based on the above example, instead of calling ``mean{}(arch, a, b, res, tag)``, one can use ``xsimd::dispatch(mean{})(a, b, res, tag)``. More about this can be found in the :ref:`Arch Dispatching` section. 
xtensor-stack-xsimd-541558d/docs/source/xsimd.svg000066400000000000000000000055741517435117100220370ustar00rootroot00000000000000 xtensor-stack-xsimd-541558d/environment.yml000066400000000000000000000001201517435117100210070ustar00rootroot00000000000000name: xsimd channels: - conda-forge dependencies: - ninja - xtl - doctest xtensor-stack-xsimd-541558d/examples/000077500000000000000000000000001517435117100175455ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/examples/CMakeLists.txt000066400000000000000000000036621517435117100223140ustar00rootroot00000000000000
############################################################################
# Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         #
# Martin Renou                                                             #
# Copyright (c) QuantStack                                                 #
# Copyright (c) Serge Guelton                                              #
#                                                                          #
# Distributed under the terms of the BSD 3-Clause License.                 #
#                                                                          #
# The full license is in the file LICENSE, distributed with this software. #
############################################################################

# Build script for the xsimd examples (currently the `mandelbrot` benchmark).
# It works both as a standalone project and as a subdirectory of a parent build.

cmake_minimum_required(VERSION 3.13)

# Standalone configuration: this directory is the top of the build, so no
# parent project has declared `project()` or provided the `xsimd` target yet.
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    project(xsimd-examples)
    find_package(xsimd REQUIRED CONFIG)
endif ()

# Default to an optimized build when the user did not choose one.
# CMAKE_BUILD_TYPE is meaningless for multi-config generators (Visual Studio,
# Xcode, Ninja Multi-Config), so the cache is left untouched there.
get_property(xsimd_examples_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT xsimd_examples_is_multi_config AND NOT CMAKE_BUILD_TYPE)
    message(STATUS "Setting examples build type to Release")
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
elseif(CMAKE_BUILD_TYPE)
    message(STATUS "Examples build type is ${CMAKE_BUILD_TYPE}")
endif()

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    # Tune for the build machine unless the user already selected a target
    # architecture through the compiler flags or CMAKE_OSX_ARCHITECTURES.
    if (NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
        string(APPEND CMAKE_CXX_FLAGS " -march=native -mtune=native")
    endif()
    # OpenMP is enabled for the non-Clang compilers of this group only
    # (GNU, Intel); any Clang-family compiler, clang-cl included, skips it.
    if(NOT CMAKE_CXX_COMPILER_ID MATCHES Clang)
        string(APPEND CMAKE_CXX_FLAGS " -fopenmp")
    endif()
endif()

add_executable(mandelbrot mandelbrot.cpp)
target_link_libraries(mandelbrot PRIVATE xsimd)
set_property(TARGET mandelbrot PROPERTY CXX_STANDARD 14)

# Optional xtl dependency, linked only when ENABLE_XTL_COMPLEX is set.
if(ENABLE_XTL_COMPLEX)
    target_link_libraries(mandelbrot PRIVATE xtl)
endif()

# `xmandelbrot` builds the example (if needed) and then runs it.
add_custom_target(xmandelbrot COMMAND mandelbrot DEPENDS mandelbrot VERBATIM)
xtensor-stack-xsimd-541558d/examples/mandelbrot.cpp000066400000000000000000000233061517435117100224040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/mandelbrot.cpp // Author Jefferson Amstutz / intel #include #include #include #include #include "pico_bench.hpp" #include // helper function to write the rendered image as PPM file inline void writePPM(const std::string& fileName, const int sizeX, const int sizeY, const int* pixel) { FILE* file = fopen(fileName.c_str(), "wb"); fprintf(file, "P6\n%i %i\n255\n", sizeX, sizeY); unsigned char* out = (unsigned char*)alloca(3 * sizeX); for (int y = 0; y < sizeY; y++) { const unsigned char* in = (const unsigned char*)&pixel[(sizeY - 1 - y) * sizeX]; for (int x = 0; x < sizeX; x++) { out[3 * x + 0] = in[4 * x + 0]; out[3 * x + 1] = in[4 * x + 1]; out[3 * x + 2] = in[4 * x + 2]; } fwrite(out, 3 * sizeX, sizeof(char), file); } fprintf(file, "\n"); fclose(file); } namespace xsimd { template inline batch mandel(const batch_bool& _active, const batch& c_re, const batch& c_im, int maxIters) { using float_batch_type = batch; using int_batch_type = batch; constexpr std::size_t N = float_batch_type::size; float_batch_type z_re = c_re; float_batch_type z_im = c_im; int_batch_type vi(0); for (int i = 0; i < maxIters; ++i) { auto active = _active & ((z_re * z_re + z_im * z_im) <= float_batch_type(4.f)); if (!xsimd::any(active)) { break; } float_batch_type new_re = z_re * z_re - z_im * z_im; float_batch_type new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; vi = select(batch_bool_cast(active), vi + 1, vi); } return vi; } template void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIters, int output[]) { using float_batch_type = batch; using int_batch_type = batch; constexpr std::size_t N = float_batch_type::size; float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; float arange[N]; std::iota(&arange[0], &arange[N], 0.f); // 
float_batch_type programIndex(&arange[0], xsimd::aligned_mode()); auto programIndex = float_batch_type::load(&arange[0], xsimd::aligned_mode()); // std::iota(programIndex.begin(), programIndex.end(), 0.f); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i += N) { float_batch_type x(x0 + (i + programIndex) * dx); float_batch_type y(y0 + j * dy); auto active = x < float_batch_type(width); int base_index = (j * width + i); auto result = mandel(active, x, y, maxIters); // implement masked store! // xsimd::store_aligned(result, output + base_index, active); int_batch_type prev_data = int_batch_type::load_unaligned(output + base_index); select(batch_bool_cast(active), result, prev_data) .store_aligned(output + base_index); } } } } // namespace xsimd // omp version //////////////////////////////////////////////////////////////// namespace omp { #pragma omp declare simd template inline int mandel(T c_re, T c_im, int count) { T z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } T new_re = z_re * z_re - z_im * z_im; T new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { #pragma omp simd for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace omp // scalar version ///////////////////////////////////////////////////////////// namespace scalar { inline int mandel(float c_re, float c_im, int count) { float z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } float new_re = z_re * z_re - z_im * z_im; float new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void 
mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace scalar // run simd version of mandelbrot benchmark for a specific arch template void run_arch( bencher_t& bencher, float x0, float y0, float x1, float y1, int width, int height, int maxIters, std::vector>& buffer) { std::fill(buffer.begin(), buffer.end(), 0); auto stats = bencher([&]() { xsimd::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buffer.data()); }); const float scalar_min = stats.min().count(); std::cout << '\n' << arch::name() << " " << stats << '\n'; auto filename = std::string("mandelbrot_") + std::string(arch::name()) + std::string(".ppm"); writePPM(filename.c_str(), width, height, buffer.data()); } template struct run_archlist; // run simd version of mandelbrot benchmark for a list // of archs template struct run_archlist> { template static void run( bencher_t& bencher, float x0, float y0, float x1, float y1, int width, int height, int maxIters, std::vector>& buffer) { (void)std::initializer_list { (run_arch(bencher, x0, y0, x1, x1, width, height, maxIters, buffer), 0)... }; } }; int main() { using namespace std::chrono; const unsigned int width = 1024; const unsigned int height = 768; const float x0 = -2; const float x1 = 1; const float y0 = -1; const float y1 = 1; const int maxIters = 256; std::vector> buf(width * height); auto bencher = pico_bench::Benchmarker { 64, seconds { 10 } }; std::cout << "starting benchmarks (results in 'ms')... 
" << '\n'; // scalar run /////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_scalar = bencher([&]() { scalar::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float scalar_min = stats_scalar.min().count(); std::cout << '\n' << "scalar " << stats_scalar << '\n'; writePPM("mandelbrot_scalar.ppm", width, height, buf.data()); // omp run ////////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_omp = bencher([&]() { omp::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float omp_min = stats_omp.min().count(); std::cout << '\n' << "omp " << stats_omp << '\n'; writePPM("mandelbrot_omp.ppm", width, height, buf.data()); run_archlist::run(bencher, x0, y0, x1, y1, width, height, maxIters, buf); return 0; } xtensor-stack-xsimd-541558d/examples/pico_bench.hpp000066400000000000000000000175641517435117100223640ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/pico_bench.h // Author Jefferson Amstutz / intel #ifndef PICO_BENCH_H #define PICO_BENCH_H #include #include #include #include #include #include #include #include #include #include namespace pico_bench { /* Statistics on some time measurement value T, e.g. 
T = * std::chrono::milliseconds T must be some std::chrono::duration type */ template class Statistics { using rep = typename T::rep; std::vector samples; public: std::string time_suffix; Statistics(std::vector s) : samples(s) { std::sort(std::begin(samples), std::end(samples)); } T percentile(const float p) const { return percentile(p, samples); } // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile void winsorize(const float limit) { winsorize(limit, samples); } T median() const { return percentile(50.0, samples); } T median_abs_dev() const { const auto m = median(); std::vector deviations; deviations.reserve(samples.size()); std::transform(std::begin(samples), std::end(samples), std::back_inserter(deviations), [&m](const T& t) { return T { std::abs((t - m).count()) }; }); std::sort(std::begin(deviations), std::end(deviations)); return percentile(50.0, deviations); } T mean() const { const auto m = std::accumulate(std::begin(samples), std::end(samples), T { 0 }); return m / samples.size(); } T std_dev() const { const auto m = mean(); auto val = std::accumulate( std::begin(samples), std::end(samples), T { 0 }, [&m](const T& p, const T& t) { return T { static_cast(p.count() + std::pow((t - m).count(), 2)) }; }); return T { static_cast(std::sqrt(1.0 / static_cast(samples.size()) * static_cast(val.count()))) }; } T min() const { return samples.front(); } T max() const { return samples.back(); } std::size_t size() const { return samples.size(); } const T& operator[](size_t i) const { return samples[i]; } private: // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile static void winsorize(const float limit, std::vector& samples) { const auto low = percentile(limit, samples); const auto high = percentile(100.0 - limit, samples); for (auto& t : samples) { if (t < low) { t = low; } else if (t > high) { t = high; } } } 
static T percentile(const float p, const std::vector& samples) { assert(!samples.empty()); assert(p <= 100.0); assert(p >= 0.0); if (samples.size() == 1) { return samples.front(); } if (p == 100.0) { return samples.back(); } const double rank = p / 100.0 * (static_cast(samples.size()) - 1.0); const double low_r = std::floor(rank); const double dist = rank - low_r; const size_t k = static_cast(low_r); const auto low = samples[k]; const auto high = samples[k + 1]; return T { static_cast(low.count() + (high - low).count() * dist) }; } }; /* Benchmarking measurment using some desired unit of time measurement, * e.g. T = std::chrono::milliseconds. T must be some std::chrono::duration */ template class Benchmarker { const size_t MAX_ITER; const T MAX_RUNTIME; template struct BenchWrapper { Fn fn; BenchWrapper(Fn fn) : fn(fn) { } T operator()() { auto start = std::chrono::high_resolution_clock::now(); fn(); auto end = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(end - start); } }; public: using stats_type = Statistics; // Benchmark the functions either max_iter times or until max_runtime // seconds have elapsed max_runtime should be > 0 Benchmarker(const size_t max_iter, const std::chrono::seconds max_runtime) : MAX_ITER(max_iter) , MAX_RUNTIME(std::chrono::duration_cast(max_runtime)) { } // Create a benchmarker that will run the function for the desired number of // iterations, regardless of how long it takes Benchmarker(const size_t max_iter) : MAX_ITER(max_iter) , MAX_RUNTIME(0) { } template std::enable_if_t()())>::value, stats_type> operator()(Fn fn) const { return (*this)(BenchWrapper { fn }); } template std::enable_if_t()()), T>::value, stats_type> operator()(Fn fn) const { // Do a single un-timed warm up run fn(); T elapsed { 0 }; std::vector samples; for (size_t i = 0; i < MAX_ITER && (MAX_RUNTIME.count() == 0 || elapsed < MAX_RUNTIME); ++i, elapsed += samples.back()) { samples.push_back(fn()); } return stats_type { samples }; } }; 
} // namespace pico_bench template std::ostream& operator<<(std::ostream& os, const pico_bench::Statistics& stats) { os << "Statistics:\n" << "\tmax: " << stats.max().count() << stats.time_suffix << "\n" << "\tmin: " << stats.min().count() << stats.time_suffix << "\n" << "\tmedian: " << stats.median().count() << stats.time_suffix << "\n" << "\tmedian abs dev: " << stats.median_abs_dev().count() << stats.time_suffix << "\n" << "\tmean: " << stats.mean().count() << stats.time_suffix << "\n" << "\tstd dev: " << stats.std_dev().count() << stats.time_suffix << "\n" << "\t# of samples: " << stats.size(); return os; } #endif xtensor-stack-xsimd-541558d/include/000077500000000000000000000000001517435117100173525ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/000077500000000000000000000000001517435117100204765ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/arch/000077500000000000000000000000001517435117100214135ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/arch/common/000077500000000000000000000000001517435117100227035ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_arithmetic.hpp000066400000000000000000000272001517435117100303220ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_ARITHMETIC_HPP #define XSIMD_COMMON_ARITHMETIC_HPP #include #include #include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_common_details.hpp" namespace xsimd { namespace kernel { using namespace types; // bitwise_lshift template ::value>*/> XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x << y; }, self, other); } template ::value>*/> XSIMD_INLINE batch bitwise_lshift(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Shift must be less than the number of bits in T"); return bitwise_lshift(self, shift, A {}); } // bitwise_rshift template ::value>*/> XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x >> y; }, self, other); } template ::value>*/> XSIMD_INLINE batch bitwise_rshift(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Shift must be less than the number of bits in T"); return bitwise_rshift(self, shift, A {}); } // decr template XSIMD_INLINE batch decr(batch const& self, requires_arch) noexcept { return self - T(1); } // decr_if template XSIMD_INLINE batch decr_if(batch const& self, Mask const& mask, requires_arch) noexcept { return select(mask, decr(self), self); } // div template ::value>> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x / y; }, self, other); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return x * y + z; } template XSIMD_INLINE batch, A> 
fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return x * y - z; } template XSIMD_INLINE batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -x * y + z; } template XSIMD_INLINE batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -x * y - z; } template XSIMD_INLINE batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fmas template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { struct even_lane { static constexpr bool get(unsigned const i, unsigned) noexcept { return (i & 1u) == 0; } }; const auto mask = make_batch_bool_constant(); return fma(x, y, select(mask, neg(z), z)); } // incr template XSIMD_INLINE batch incr(batch const& self, requires_arch) noexcept { return self + T(1); } // incr_if 
template XSIMD_INLINE batch incr_if(batch const& self, Mask const& mask, requires_arch) noexcept { return select(mask, incr(self), self); } // mul template ::value>*/> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x * y; }, self, other); } // rotl template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; return (self << other) | (self >> (bits - other)); } template XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count amount must be less than the number of bits in T"); return bitwise_lshift(self) | bitwise_rshift(self); } // rotr template XSIMD_INLINE batch rotr(batch const& self, STy other, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; return (self >> other) | (self << (bits - other)); } template XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); return bitwise_rshift(self) | bitwise_lshift(self); } // sadd template XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return add(self, other); // no saturated arithmetic on floating point numbers } template ::value>*/> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(other >= 0, self_pos_branch, self_neg_branch); } else { const auto diffmax = std::numeric_limits::max() - 
self; const auto mindiff = min(diffmax, other); return self + mindiff; } } template XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return add(self, other); // no saturated arithmetic on floating point numbers } // ssub template XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return sub(self, other); // no saturated arithmetic on floating point numbers } template ::value>*/> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } template XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return sub(self, other); // no saturated arithmetic on floating point numbers } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_bit.hpp000066400000000000000000000156251517435117100267570ustar00rootroot00000000000000/**************************************************************** * Partial backport of `__cpp_lib_bitops == 201907L` from C++20 * ****************************************************************/ #ifndef XSIMD_BIT_HPP #define XSIMD_BIT_HPP #if __cplusplus > 202002L #include #if __cpp_lib_bitops >= 201907L #include namespace xsimd { namespace detail { using std::countl_one; using std::countl_zero; using std::countr_one; using std::countr_zero; using std::popcount; } } #endif #else #include #include #ifdef __has_builtin #define XSIMD_HAS_BUILTIN(x) __has_builtin(x) #else #define XSIMD_HAS_BUILTIN(x) 0 #endif #ifdef _MSC_VER #include #endif namespace xsimd { namespace detail { // FIXME: We could do better by dispatching to the appropriate popcount instruction // depending on the arch. 
template ::value>> XSIMD_INLINE int popcount(T x) noexcept { #if XSIMD_HAS_BUILTIN(__builtin_popcountg) return __builtin_popcountg(x); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if XSIMD_HAS_BUILTIN(__builtin_popcount) return __builtin_popcount(x); #elif defined(_MSC_VER) return __popcnt(x); #else // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 return ((uint64_t)x * 0x200040008001ULL & 0x111111111111111ULL) % 0xf; #endif } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { #if XSIMD_HAS_BUILTIN(__builtin_popcount) return __builtin_popcount(x); #elif defined(_MSC_VER) return __popcnt16(x); #else // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 constexpr unsigned long long msb12 = 0x1001001001001ULL; constexpr unsigned long long mask5 = 0x84210842108421ULL; unsigned int v = (unsigned int)x; return ((v & 0xfff) * msb12 & mask5) % 0x1f + (((v & 0xfff000) >> 12) * msb12 & mask5) % 0x1f; #endif } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { #if XSIMD_HAS_BUILTIN(__builtin_popcount) return __builtin_popcount(x); #elif defined(_MSC_VER) return __popcnt(x); #else // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel x = x - ((x >> 1) & (T) ~(T)0 / 3); x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3); x = (x + (x >> 4)) & (T) ~(T)0 / 255 * 15; return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT; #endif } else { // sizeof(T) == 8 #if XSIMD_HAS_BUILTIN(__builtin_popcountll) return __builtin_popcountll(x); #elif XSIMD_HAS_BUILTIN(__builtin_popcount) return __builtin_popcount((unsigned int)x) + __builtin_popcount((unsigned int)(x >> 32)); #elif defined(_MSC_VER) #ifdef _M_X64 return (int)__popcnt64(x); #else return (int)(__popcnt((unsigned int)x) + __popcnt((unsigned int)(x >> 32))); #endif #else // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel x = x - ((x >> 1) & (T) ~(T)0 / 3); x = (x & (T) ~(T)0 / 15 * 3) + ((x >> 2) & (T) ~(T)0 / 15 * 3); x = (x + (x >> 4)) & (T) ~(T)0 
/ 255 * 15; return (x * ((T) ~(T)0 / 255)) >> (sizeof(T) - 1) * CHAR_BIT; #endif } #endif } template ::value>> XSIMD_INLINE int countl_zero(T x) noexcept { #if XSIMD_HAS_BUILTIN(__builtin_clzg) return __builtin_clzg(x, (int)(sizeof(T) * CHAR_BIT)); #else if (x == 0) return sizeof(T) * CHAR_BIT; XSIMD_IF_CONSTEXPR(sizeof(T) <= 4) { #if XSIMD_HAS_BUILTIN(__builtin_clz) return __builtin_clz((unsigned int)x) - (4 - sizeof(T)) * CHAR_BIT; #elif defined(_MSC_VER) unsigned long index; _BitScanReverse(&index, (unsigned long)x); return sizeof(T) * CHAR_BIT - index - 1; #else x |= x >> 1; x |= x >> 2; x |= x >> 4; XSIMD_IF_CONSTEXPR(sizeof(T) >= 2) { x |= x >> 8; } XSIMD_IF_CONSTEXPR(sizeof(T) >= 4) { x |= x >> 16; } return sizeof(T) * CHAR_BIT - popcount(x); #endif } else { // sizeof(T) == 8 #if XSIMD_HAS_BUILTIN(__builtin_clzll) return __builtin_clzll((unsigned long long)x); #elif defined(_MSC_VER) && defined(_M_X64) unsigned long index; _BitScanReverse64(&index, (unsigned long long)x); return sizeof(T) * CHAR_BIT - index - 1; #else x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16; x |= x >> 32; return sizeof(T) * CHAR_BIT - popcount(x); #endif } #endif } template ::value>> XSIMD_INLINE int countl_one(T x) noexcept { return countl_zero(T(~x)); } template ::value>> XSIMD_INLINE int countr_zero(T x) noexcept { #if XSIMD_HAS_BUILTIN(__builtin_ctzg) return __builtin_ctzg(x, (int)(sizeof(T) * CHAR_BIT)); #else if (x == 0) return sizeof(T) * CHAR_BIT; XSIMD_IF_CONSTEXPR(sizeof(T) <= 4) { #if XSIMD_HAS_BUILTIN(__builtin_ctz) return __builtin_ctz((unsigned int)x); #elif defined(_MSC_VER) unsigned long index; _BitScanForward(&index, (unsigned long)x); return index; #endif } else { // sizeof(T) == 8 #if XSIMD_HAS_BUILTIN(__builtin_ctzll) return __builtin_ctzll((unsigned long long)x); #elif defined(_MSC_VER) && defined(_M_X64) unsigned long index; _BitScanForward64(&index, (unsigned long long)x); return index; #endif } // 
https://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightMultLookup return popcount((T)((x & -x) - 1)); #endif } template ::value>> XSIMD_INLINE int countr_one(T x) noexcept { return countr_zero(T(~x)); } } } #endif #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_cast.hpp000066400000000000000000000032221517435117100271210ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMMON_CAST_HPP #define XSIMD_COMMON_CAST_HPP #include #include "../../config/xsimd_macros.hpp" #include "../../utils/xsimd_type_traits.hpp" namespace xsimd { namespace kernel { template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; x.store_aligned(&buffer[0]); using T_out = widen_t; alignas(A::alignment()) T_out out_buffer[batch::size]; for (size_t i = 0; i < batch::size; ++i) out_buffer[i] = static_cast(buffer[i]); return { batch::load_aligned(&out_buffer[0]), batch::load_aligned(&out_buffer[batch::size]) }; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_complex.hpp000066400000000000000000000077461517435117100276550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_COMPLEX_HPP #define XSIMD_COMMON_COMPLEX_HPP #include #include "./xsimd_common_details.hpp" namespace xsimd { namespace kernel { using namespace types; // real template XSIMD_INLINE batch real(batch const& self, requires_arch) noexcept { return self; } template XSIMD_INLINE batch real(batch, A> const& self, requires_arch) noexcept { return self.real(); } // imag template XSIMD_INLINE batch imag(batch const& /*self*/, requires_arch) noexcept { return batch(T(0)); } template XSIMD_INLINE batch imag(batch, A> const& self, requires_arch) noexcept { return self.imag(); } // arg template XSIMD_INLINE real_batch_type_t> arg(batch const& self, requires_arch) noexcept { return atan2(imag(self), real(self)); } // conj template XSIMD_INLINE complex_batch_type_t> conj(batch const& self, requires_arch) noexcept { return { real(self), -imag(self) }; } // norm template XSIMD_INLINE real_batch_type_t> norm(batch const& self, requires_arch) noexcept { return { fma(real(self), real(self), imag(self) * imag(self)) }; } // proj template XSIMD_INLINE complex_batch_type_t> proj(batch const& self, requires_arch) noexcept { using batch_type = complex_batch_type_t>; using real_batch = typename batch_type::real_batch; using real_value_type = typename real_batch::value_type; #ifdef __FAST_MATH__ return { self }; #else auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); return select(cond, batch_type(constants::infinity(), copysign(real_batch(real_value_type(0)), imag(self))), batch_type(self)); #endif } template XSIMD_INLINE batch_bool isnan(batch, A> const& self, requires_arch) noexcept { return batch_bool(isnan(self.real()) || isnan(self.imag())); } template XSIMD_INLINE batch_bool isinf(batch, A> const& self, requires_arch) noexcept { return batch_bool(isinf(self.real()) || isinf(self.imag())); } template XSIMD_INLINE batch_bool isfinite(batch, A> const& self, requires_arch) 
noexcept { return batch_bool(isfinite(self.real()) && isfinite(self.imag())); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_details.hpp000066400000000000000000000437761517435117100276360ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMMON_DETAILS_HPP #define XSIMD_COMMON_DETAILS_HPP #include #include "../../math/xsimd_rem_pio2.hpp" #include "../../types/xsimd_common_arch.hpp" #include "../../types/xsimd_utils.hpp" #include "../xsimd_constants.hpp" namespace xsimd { // Forward declaration. Should we put them in a separate file? 
template XSIMD_INLINE batch abs(batch const& self) noexcept; template XSIMD_INLINE batch abs(batch, A> const& self) noexcept; template XSIMD_INLINE bool any(batch_bool const& self) noexcept; template XSIMD_INLINE batch atan2(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch batch_cast(batch const&, batch const& out) noexcept; template XSIMD_INLINE batch bitofsign(batch const& self) noexcept; template XSIMD_INLINE batch bitwise_cast(batch const& self) noexcept; template XSIMD_INLINE batch cos(batch const& self) noexcept; template XSIMD_INLINE batch cosh(batch const& self) noexcept; template XSIMD_INLINE batch exp(batch const& self) noexcept; template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z) noexcept; template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z) noexcept; template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z) noexcept; template XSIMD_INLINE batch frexp(const batch& x, const batch, A>& e) noexcept; template XSIMD_INLINE batch horner(const batch& self) noexcept; template XSIMD_INLINE batch hypot(const batch& self) noexcept; template XSIMD_INLINE batch_bool is_even(batch const& self) noexcept; template XSIMD_INLINE batch_bool is_flint(batch const& self) noexcept; template XSIMD_INLINE batch_bool is_odd(batch const& self) noexcept; template XSIMD_INLINE typename batch::batch_bool_type isinf(batch const& self) noexcept; template XSIMD_INLINE typename batch::batch_bool_type isfinite(batch const& self) noexcept; template XSIMD_INLINE typename batch::batch_bool_type isnan(batch const& self) noexcept; template XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& e) noexcept; template XSIMD_INLINE batch log(batch const& self) noexcept; template XSIMD_INLINE batch nearbyint(batch const& self) noexcept; template XSIMD_INLINE batch, A> nearbyint_as_int(const batch& x) noexcept; template XSIMD_INLINE T reduce_add(batch const&) noexcept; template XSIMD_INLINE T 
reduce_mul(batch const&) noexcept; template XSIMD_INLINE batch select(batch_bool const&, batch const&, batch const&) noexcept; template XSIMD_INLINE batch, A> select(batch_bool const&, batch, A> const&, batch, A> const&) noexcept; template XSIMD_INLINE batch sign(batch const& self) noexcept; template XSIMD_INLINE batch signnz(batch const& self) noexcept; template XSIMD_INLINE batch sin(batch const& self) noexcept; template XSIMD_INLINE batch sinh(batch const& self) noexcept; template XSIMD_INLINE std::pair, batch> sincos(batch const& self) noexcept; template XSIMD_INLINE batch sqrt(batch const& self) noexcept; template XSIMD_INLINE std::enable_if_t::value, batch> swizzle(batch const& x, batch_constant mask) noexcept; template XSIMD_INLINE batch tan(batch const& self) noexcept; template XSIMD_INLINE batch, A> to_float(batch const& self) noexcept; template XSIMD_INLINE batch, A> to_int(batch const& self) noexcept; template XSIMD_INLINE batch trunc(batch const& self) noexcept; namespace kernel { namespace detail { // Prevent -ffast-math from reassociating floating-point // arithmetic across this point. The reason string // documents *why* at each call site; unused at runtime. // // Zero-cost register constraints per target: // x86 "+x" (XMM/YMM/ZMM, also scalar float/double) // ARM "+w" (V-reg / SVE Z-reg, also scalar float/double) // PPC "+wa" (VS register, also scalar float/double) // RISC-V "+f" (F/D register, scalar float/double) // RISC-V RVV "+vr" (V register; GCC 15+ / Clang 20+) // // On unknown targets the "+m" fallback spills; it is // only emitted when the compiler can actually reassociate. 
template XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept { #if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_INLINE_ASM && !defined(__EMSCRIPTEN__) #if XSIMD_WITH_SSE2 __asm__ volatile("" : "+x"(x)); #elif XSIMD_WITH_NEON || XSIMD_WITH_SVE __asm__ volatile("" : "+w"(x)); #elif XSIMD_WITH_VSX __asm__ volatile("" : "+wa"(x)); #else __asm__ volatile("" : "+m"(x)); #endif #else (void)x; #endif } // RISC-V scalar float/double: use F/D registers instead of // spilling through "+m". These overloads also serve // emulated batches on RISC-V via the std::array overload. #if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_INLINE_ASM && defined(__riscv) XSIMD_INLINE void reassociation_barrier(float& x, const char*) noexcept { __asm__ volatile("" : "+f"(x)); } XSIMD_INLINE void reassociation_barrier(double& x, const char*) noexcept { __asm__ volatile("" : "+f"(x)); } #endif template XSIMD_INLINE void reassociation_barrier(std::array& arr, const char* reason) noexcept { for (auto& v : arr) reassociation_barrier(v, reason); } template XSIMD_INLINE void reassociation_barrier(batch& b, const char* reason) noexcept { #if XSIMD_REASSOCIATIVE_MATH && XSIMD_WITH_RVV && XSIMD_WITH_INLINE_ASM && ((__GNUC__ >= 15) || (__clang_major__ >= 20)) __asm__ volatile("" : "+vr"(b.data.value.value)); (void)reason; #else reassociation_barrier(b.data, reason); #endif } template XSIMD_INLINE batch apply(F&& func, batch const& self, batch const& other) noexcept { constexpr std::size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; alignas(A::alignment()) T other_buffer[size]; self.store_aligned(&self_buffer[0]); other.store_aligned(&other_buffer[0]); for (std::size_t i = 0; i < size; ++i) { self_buffer[i] = func(self_buffer[i], other_buffer[i]); } return batch::load_aligned(self_buffer); } template XSIMD_INLINE batch apply_transform(F&& func, batch const& self) noexcept { static_assert(batch::size == batch::size, "Source and destination sizes must match"); constexpr std::size_t 
src_size = batch::size; constexpr std::size_t dest_size = batch::size; alignas(A::alignment()) T self_buffer[src_size]; alignas(A::alignment()) U other_buffer[dest_size]; self.store_aligned(&self_buffer[0]); for (std::size_t i = 0; i < src_size; ++i) { other_buffer[i] = func(self_buffer[i]); } return batch::load_aligned(other_buffer); } } // some common fast_cast conversion namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } // Provide a common uint32_t -> float cast only if we have a // non-common int32_t -> float fast_cast template const&>(), std::declval const&>(), A {}))> XSIMD_INLINE batch fast_cast(batch const& v, batch const&, requires_arch) noexcept { // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse batch msk_lo(0xFFFF); batch cnst65536f(65536.0f); auto v_lo = batch_cast(v & msk_lo); /* extract the 16 lowest significant bits of self */ auto v_hi = batch_cast(v >> 16); /* 16 most significant bits of v */ auto v_lo_flt = batch_cast(v_lo); /* No rounding 
*/ auto v_hi_flt = batch_cast(v_hi); /* No rounding */ v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ } // Provide a common float -> uint32_t cast only if we have a // non-common float -> int32_t fast_cast template const&>(), std::declval const&>(), A {}))> XSIMD_INLINE batch fast_cast(batch const& v, batch const&, requires_arch) noexcept { auto is_large = v >= batch(1u << 31); auto small_v = bitwise_cast(batch_cast(v)); auto large_v = bitwise_cast( batch_cast(v - batch(1u << 31)) ^ batch(1u << 31)); return bitwise_cast(select(is_large, large_v, small_v)); } } namespace detail { // Generic conversion handling machinery. Each architecture must define // conversion function when such conversions exits in the form of // intrinsic. Then we use that information to automatically decide whether // to use scalar or vector conversion when doing load / store / batch_cast struct with_fast_conversion { }; struct with_slow_conversion { }; template struct conversion_type_impl { using type = with_slow_conversion; }; using xsimd::detail::void_t; template struct conversion_type_impl&>(), std::declval&>(), std::declval()))>> { using type = with_fast_conversion; }; template using conversion_type = typename conversion_type_impl::type; } namespace detail { /* origin: boost/simdfunction/horn.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE B coef() noexcept { using value_type = typename B::value_type; return B(bit_cast(as_unsigned_integer_t(c))); } template XSIMD_INLINE B horner(const B&) noexcept { return B(typename B::value_type(0.)); } template XSIMD_INLINE B horner(const B&) noexcept { return coef(); } template XSIMD_INLINE B horner(const B& self) noexcept { return fma(self, horner(self), coef()); } /* origin: boost/simdfunction/horn1.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE B horner1(const B&) noexcept { return B(1.); } template XSIMD_INLINE B horner1(const B& x) noexcept { return x + detail::coef(); } template XSIMD_INLINE B horner1(const B& x) noexcept { return fma(x, horner1(x), detail::coef()); } } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_logical.hpp000066400000000000000000000232171517435117100276070ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_LOGICAL_HPP #define XSIMD_COMMON_LOGICAL_HPP #include "./xsimd_common_bit.hpp" #include "./xsimd_common_details.hpp" #include namespace xsimd { namespace kernel { using namespace types; // count template XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { return xsimd::detail::popcount(self.mask()); } template XSIMD_INLINE size_t countl_zero(batch_bool const& self, requires_arch) noexcept { constexpr size_t unused_bits = 64 - batch_bool::size; constexpr uint64_t lower_mask = batch_bool::size < 64 ? ((uint64_t)1 << (batch_bool::size % 64)) - 1 : (uint64_t)-1; return xsimd::detail::countl_zero(self.mask() & lower_mask) - unused_bits; } template XSIMD_INLINE size_t countl_one(batch_bool const& self, requires_arch) noexcept { constexpr size_t unused_bits = 64 - batch_bool::size; constexpr uint64_t upper_mask = batch_bool::size < 64 ? ~(((uint64_t)1 << (batch_bool::size % 64)) - 1) : (uint64_t)0; return xsimd::detail::countl_one(self.mask() | upper_mask) - unused_bits; } template XSIMD_INLINE size_t countr_zero(batch_bool const& self, requires_arch) noexcept { constexpr uint64_t stop = batch_bool::size < 64 ? (uint64_t)1 << (batch_bool::size % 64) : 0; return xsimd::detail::countr_zero(self.mask() | stop); } template XSIMD_INLINE size_t countr_one(batch_bool const& self, requires_arch) noexcept { constexpr uint64_t stop = batch_bool::size < 64 ? ~((uint64_t)1 << (batch_bool::size % 64)) : (uint64_t)-1; return xsimd::detail::countr_one(self.mask() & stop); } // from mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; // This is inefficient and should never be called. 
for (size_t i = 0; i < batch_bool::size; ++i) buffer[i] = mask & (1ull << i); return batch_bool::load_aligned(buffer); } // ge template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return other <= self; } // gt template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return other < self; } // is_even template XSIMD_INLINE batch_bool is_even(batch const& self, requires_arch) noexcept { return is_flint(self * T(0.5)); } // is_flint template XSIMD_INLINE batch_bool is_flint(batch const& self, requires_arch) noexcept { auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); return frac == T(0.); } // is_odd template XSIMD_INLINE batch_bool is_odd(batch const& self, requires_arch) noexcept { return is_even(self - T(1.)); } // isinf template ::value>> XSIMD_INLINE batch_bool isinf(batch const&, requires_arch) noexcept { return batch_bool(false); } template XSIMD_INLINE batch_bool isinf(batch const& self, requires_arch) noexcept { #ifdef __FAST_MATH__ (void)self; return { false }; #else return abs(self) == std::numeric_limits::infinity(); #endif } template XSIMD_INLINE batch_bool isinf(batch const& self, requires_arch) noexcept { #ifdef __FAST_MATH__ (void)self; return { false }; #else return abs(self) == std::numeric_limits::infinity(); #endif } // isfinite template ::value>> XSIMD_INLINE batch_bool isfinite(batch const&, requires_arch) noexcept { return batch_bool(true); } template XSIMD_INLINE batch_bool isfinite(batch const& self, requires_arch) noexcept { return (self - self) == 0.f; } template XSIMD_INLINE batch_bool isfinite(batch const& self, requires_arch) noexcept { return (self - self) == 0.; } // isnan template ::value>> XSIMD_INLINE batch_bool isnan(batch const&, requires_arch) noexcept { return batch_bool(false); } // le template ::value>> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return (self < 
other) || (self == other); } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return !(other == self); } // logical_and template XSIMD_INLINE batch logical_and(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x && y; }, self, other); } // logical_or template XSIMD_INLINE batch logical_or(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x || y; }, self, other); } // mask template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; self.store_aligned(buffer); // This is inefficient and should never be called. uint64_t res = 0; for (size_t i = 0; i < batch_bool::size; ++i) if (buffer[i]) res |= 1ul << i; return res; } // select namespace detail { template using is_batch_bool_register_same = std::is_same::register_type, typename batch::register_type>; } template ::value, int> = 3> XSIMD_INLINE batch_bool select(batch_bool const& cond, batch_bool const& true_br, batch_bool const& false_br, requires_arch) { using register_type = typename batch_bool::register_type; // Do not cast, but rather reinterpret the masks as batches. 
const auto true_v = batch { static_cast(true_br) }; const auto false_v = batch { static_cast(false_br) }; return batch_bool { select(cond, true_v, false_v) }; } template ::value, int> = 3> XSIMD_INLINE batch_bool select(batch_bool const& cond, batch_bool const& true_br, batch_bool const& false_br, requires_arch) { return (true_br & cond) | (bitwise_andnot(false_br, cond)); } template XSIMD_INLINE batch_bool select(batch_bool_constant const& cond, batch_bool const& true_br, batch_bool const& false_br, requires_arch) { return (true_br & cond) | (false_br & ~cond); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_math.hpp000066400000000000000000003477651517435117100271470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_MATH_HPP #define XSIMD_COMMON_MATH_HPP #include "../xsimd_scalar.hpp" #include "./xsimd_common_details.hpp" #include "./xsimd_common_trigo.hpp" #include namespace xsimd { namespace kernel { using namespace types; // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) return self; else { auto sign = bitofsign(self); auto inv = self ^ sign; return inv - sign; } } template XSIMD_INLINE batch abs(batch, A> const& z, requires_arch) noexcept { return hypot(z.real(), z.imag()); } // avg namespace detail { template XSIMD_INLINE batch avg(batch const& x, batch const& y, std::true_type, std::false_type) noexcept { return (x & y) + ((x ^ y) >> 1); } template XSIMD_INLINE batch avg(batch const& x, batch const& y, std::true_type, std::true_type) noexcept { // Inspired by // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c auto t = (x & y) + ((x ^ y) >> 1); auto t_u = bitwise_cast>(t); auto avg = t + (bitwise_cast(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); return avg; } template XSIMD_INLINE batch avg(batch const& x, batch const& y, std::false_type, std::true_type) noexcept { return (x + y) / 2; } } template XSIMD_INLINE batch avg(batch const& x, batch const& y, requires_arch) noexcept { return detail::avg(x, y, typename std::is_integral::type {}, typename std::is_signed::type {}); } // avgr namespace detail { template XSIMD_INLINE batch avgr(batch const& x, batch const& y, std::true_type) noexcept { constexpr unsigned shift = 8 * sizeof(T) - 1; auto adj = std::is_signed::value ? 
((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift); return ::xsimd::kernel::avg(x, y, A {}) + adj; } template XSIMD_INLINE batch avgr(batch const& x, batch const& y, std::false_type) noexcept { return ::xsimd::kernel::avg(x, y, A {}); } } template XSIMD_INLINE batch avgr(batch const& x, batch const& y, requires_arch) noexcept { return detail::avgr(x, y, typename std::is_integral::type {}); } // batch_cast template XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch) noexcept { return self; } namespace detail { template XSIMD_INLINE batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) noexcept { return fast_cast(self, out, A {}); } #if defined(__clang__) || __GNUC__ template XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept __attribute__((no_sanitize("undefined"))); #endif template XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be no conversion for this type combination"); using batch_type_in = batch; using batch_type_out = batch; static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; self.store_aligned(&buffer_in[0]); for (size_t i = 0; i < batch_type_in::size; ++i) buffer_out[i] = static_cast(buffer_in[i]); return batch_type_out::load_aligned(buffer_out); } } template XSIMD_INLINE batch batch_cast(batch const& self, batch const& out, requires_arch) noexcept { return detail::batch_cast(self, out, A {}, detail::conversion_type {}); } // bitofsign template XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept { static_assert(std::is_integral::value, "int type implementation"); if (std::is_unsigned::value) return batch(0); else return self >> (T)(8 * sizeof(T) - 1); } 
template XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept { return self & constants::signmask>(); } template XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept { return self & constants::signmask>(); } // bitwise_cast template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return self; } // cbrt /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch cbrt(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS auto denormal = z < constants::smallestposval(); z = select(denormal, z * constants::twotonmb(), z); batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif const batch_type CBRT2(bit_cast(0x3fa14518)); const batch_type CBRT4(bit_cast(0x3fcb2ff5)); const batch_type CBRT2I(bit_cast(0x3f4b2ff5)); const batch_type CBRT4I(bit_cast(0x3f214518)); using i_type = as_integer_t; i_type e; batch_type x = frexp(z, e); x = detail::horner(x); auto flag = e >= i_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * batch_type(1.f / 3.f); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(self)) * f; #else x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES return select(self == batch_type(0.) 
|| isinf(self), self, x); #else return select(self == batch_type(0.), self, x); #endif } template XSIMD_INLINE batch cbrt(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS auto denormal = z < constants::smallestposval(); z = select(denormal, z * constants::twotonmb(), z); batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); using i_type = as_integer_t; i_type e; batch_type x = frexp(z, e); x = detail::horner(x); auto flag = e >= typename i_type::value_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * batch_type(1. / 3.); x -= (x - z / (x * x)) * batch_type(1. / 3.); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(self)) * f; #else x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES return select(self == batch_type(0.) 
|| isinf(self), self, x); #else return select(self == batch_type(0.), self, x); #endif } // clip template XSIMD_INLINE batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) noexcept { return min(hi, max(self, lo)); } // copysign template ::value>> XSIMD_INLINE batch copysign(batch const& self, batch const& other, requires_arch) noexcept { return abs(self) | bitofsign(other); } // erf namespace detail { /* origin: boost/simd/arch/common/detail/common/erf_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct erf_kernel; template struct erf_kernel> { using batch_type = batch; // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 2/3 static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept { return detail::horner(x); } // computes erfc(x)*exp(sqr(x)) // x >= 2/3 static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept { return detail::horner(x); } static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept { return (batch_type(1.) 
- x) * detail::horner(x); } }; template struct erf_kernel> { using batch_type = batch; // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 0.65 static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(x)*exp(x*x) // 0.65 <= abs(x) <= 2.2 static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(x)*exp(x*x) // 2.2 <= abs(x) <= 6 static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(rx)*exp(rx*rx) // x >= 6 rx = 1/x static XSIMD_INLINE batch_type erfc4(const batch_type& x) noexcept { return detail::horner(x); } }; } /* origin: boost/simd/arch/common/simd/function/erf.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type r1(0.); auto test1 = x < batch_type(2.f / 3.f); if (any(test1)) { r1 = self * detail::erf_kernel::erf1(x * x); if (all(test1)) return r1; } batch_type z = x / (batch_type(1.) + x); z -= batch_type(0.4f); batch_type r2 = batch_type(1.) 
- exp(-x * x) * detail::erf_kernel::erfc2(z); r2 = select(self < batch_type(0.), -r2, r2); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(xsimd::isinf(self), sign(self), r1); #endif return r1; } template XSIMD_INLINE batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type xx = x * x; batch_type lim1(0.65); batch_type lim2(2.2); auto test1 = x < lim1; batch_type r1(0.); if (any(test1)) { r1 = self * detail::erf_kernel::erf1(xx); if (all(test1)) return r1; } auto test2 = x < lim2; auto test3 = test2 && !test1; batch_type ex = exp(-xx); if (any(test3)) { batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); batch_type r2 = select(self < batch_type(0.), -z, z); r1 = select(test1, r1, r2); if (all(test1 || test3)) return r1; } batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc3(x); z = select(self < batch_type(0.), -z, z); #ifndef XSIMD_NO_INFINITIES z = select(xsimd::isinf(self), sign(self), z); #endif return select(test2, r1, z); } // erfc template XSIMD_INLINE batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test0 = self < batch_type(0.); batch_type r1(0.); auto test1 = 3.f * x < 2.f; batch_type z = x / (batch_type(1.) + x); if (any(test1)) { r1 = detail::erf_kernel::erfc3(z); if (all(test1)) return select(test0, batch_type(2.) - r1, r1); } z -= batch_type(0.4f); batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(x == constants::infinity(), batch_type(0.), r1); #endif return select(test0, batch_type(2.) - r1, r1); } template XSIMD_INLINE batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type xx = x * x; batch_type lim1(0.65); batch_type lim2(2.2); auto test0 = self < batch_type(0.); auto test1 = x < lim1; batch_type r1(0.); if (any(test1)) { r1 = batch_type(1.) 
- x * detail::erf_kernel::erf1(xx); if (all(test1)) return select(test0, batch_type(2.) - r1, r1); } auto test2 = x < lim2; auto test3 = test2 && !test1; batch_type ex = exp(-xx); if (any(test3)) { batch_type z = ex * detail::erf_kernel::erfc2(x); r1 = select(test1, r1, z); if (all(test1 || test3)) return select(test0, batch_type(2.) - r1, r1); } batch_type z = ex * detail::erf_kernel::erfc3(x); r1 = select(test2, r1, z); #ifndef XSIMD_NO_INFINITIES r1 = select(x == constants::infinity(), batch_type(0.), r1); #endif return select(test0, batch_type(2.) - r1, r1); } // estrin namespace detail { template struct estrin { B x; template XSIMD_INLINE B operator()(const Ts&... coefs) noexcept { return eval(coefs...); } private: XSIMD_INLINE B eval(const B& c0) noexcept { return c0; } XSIMD_INLINE B eval(const B& c0, const B& c1) noexcept { return fma(x, c1, c0); } template XSIMD_INLINE B eval(std::index_sequence, const Tuple& tuple) { return estrin { x * x }(std::get(tuple)...); } template XSIMD_INLINE B eval(const std::tuple& tuple) noexcept { return eval(std::make_index_sequence(), tuple); } template XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); } template XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); } template XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); } template XSIMD_INLINE B eval(const B& c0, const B& c1, const Ts&... 
coefs) noexcept { return eval(std::make_tuple(eval(c0, c1)), coefs...); } }; } template XSIMD_INLINE batch estrin(const batch& self) noexcept { using batch_type = batch; return detail::estrin { self }(detail::coef()...); } // exp /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { enum exp_reduction_tag { exp_tag, exp2_tag, exp10_tag }; template struct exp_reduction_base; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog(); } static constexpr B minlog() noexcept { return constants::minlog(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog10(); } static constexpr B minlog() noexcept { return constants::minlog10(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog2(); } static constexpr B minlog() noexcept { return constants::minlog2(); } }; template struct exp_reduction; template struct exp_reduction : exp_reduction_base, exp_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type y = detail::horner(x); return ++fma(y, x * x, x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); detail::reassociation_barrier(k, "compensated exp range reduction"); x = fnma(k, constants::log_2hi(), a); detail::reassociation_barrier(x, "compensated exp range reduction"); x = fnma(k, constants::log_2lo(), x); return k; } }; template struct exp_reduction : exp_reduction_base, exp10_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { return 
++(detail::horner(x) * x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); detail::reassociation_barrier(k, "compensated exp10 range reduction"); x = fnma(k, constants::log10_2hi(), a); detail::reassociation_barrier(x, "compensated exp10 range reduction"); x -= k * constants::log10_2lo(); return k; } }; template struct exp_reduction : exp_reduction_base, exp2_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type y = detail::horner(x); return ++fma(y, x * x, x * constants::log_2()); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(a); detail::reassociation_barrier(k, "compensated exp2 range reduction"); x = (a - k); return k; } }; template struct exp_reduction : exp_reduction_base, exp_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type t = x * x; return fnma(t, detail::horner(t), x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); detail::reassociation_barrier(k, "compensated exp range reduction"); hi = fnma(k, constants::log_2hi(), a); detail::reassociation_barrier(hi, "compensated exp range reduction"); lo = k * constants::log_2lo(); x = hi - lo; return k; } static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept { return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) 
- c)) - hi)); } }; template struct exp_reduction : exp_reduction_base, exp10_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type xx = x * x; batch_type px = x * detail::horner(xx); batch_type x2 = px / (detail::horner1(xx) - px); return ++(x2 + x2); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); detail::reassociation_barrier(k, "compensated exp10 range reduction"); x = fnma(k, constants::log10_2hi(), a); detail::reassociation_barrier(x, "compensated exp10 range reduction"); x = fnma(k, constants::log10_2lo(), x); return k; } static XSIMD_INLINE batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept { return c; } }; template struct exp_reduction : exp_reduction_base, exp2_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type t = x * x; return fnma(t, detail::horner(t), x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(a); x = (a - k) * constants::log_2(); detail::reassociation_barrier(x, "keep reduced exponent ordered before finalize"); return k; } static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept { return batch_type(1.) + x + x * c / (batch_type(2.) 
- c); } }; template XSIMD_INLINE batch exp(batch const& self) noexcept { using batch_type = batch; using reducer_t = exp_reduction; batch_type x; batch_type k = reducer_t::reduce(self, x); x = reducer_t::approx(x); x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); #ifndef __FAST_MATH__ x = select(self >= reducer_t::maxlog(), constants::infinity(), x); #endif return x; } template XSIMD_INLINE batch exp(batch const& self) noexcept { using batch_type = batch; using reducer_t = exp_reduction; batch_type hi, lo, x; batch_type k = reducer_t::reduce(self, hi, lo, x); batch_type c = reducer_t::approx(x); c = reducer_t::finalize(x, c, hi, lo); c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); #ifndef __FAST_MATH__ c = select(self >= reducer_t::maxlog(), constants::infinity(), c); #endif return c; } } template XSIMD_INLINE batch exp(batch const& self, requires_arch) noexcept { return detail::exp(self); } template XSIMD_INLINE batch, A> exp(batch, A> const& self, requires_arch) noexcept { using batch_type = batch, A>; auto isincos = sincos(self.imag()); return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); } // exp10 template XSIMD_INLINE batch exp10(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type out = detail::exp(self); detail::reassociation_barrier(out, "prevent folding exp10 for literal inputs"); return out; } // exp2 template XSIMD_INLINE batch exp2(batch const& self, requires_arch) noexcept { return detail::exp(self); } // expm1 namespace detail { /* origin: boost/simd/arch/common/detail/common/expm1_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static XSIMD_INLINE batch expm1(const batch& a) noexcept { using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type x = fnma(k, constants::log_2hi(), a); x = fnma(k, constants::log_2lo(), x); batch_type hx = x * batch_type(0.5); batch_type hxs = x * hx; batch_type r = detail::horner(hxs); batch_type t = fnma(r, hx, batch_type(3.)); batch_type e = hxs * ((r - t) / (batch_type(6.) - x * t)); e = fms(x, e, hxs); using i_type = as_integer_t; i_type ik = to_int(k); batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); batch_type y = batch_type(1.) - two2mk - (e - x); return ldexp(y, ik); } template static XSIMD_INLINE batch expm1(const batch& a) noexcept { using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type hi = fnma(k, constants::log_2hi(), a); batch_type lo = k * constants::log_2lo(); batch_type x = hi - lo; batch_type hxs = x * x * batch_type(0.5); batch_type r = detail::horner(hxs); batch_type t = batch_type(3.) - r * batch_type(0.5) * x; batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); batch_type c = (hi - x) - lo; e = (x * (e - c) - c) - hxs; using i_type = as_integer_t; i_type ik = to_int(k); batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); batch_type ct1 = batch_type(1.) 
- two2mk - (e - x); batch_type ct2 = ++(x - (e + two2mk)); batch_type y = select(k < batch_type(20.), ct1, ct2); return ldexp(y, ik); } } template XSIMD_INLINE batch expm1(batch const& self, requires_arch) noexcept { using batch_type = batch; auto x = detail::expm1(self); #ifndef __FAST_MATH__ x = select(self > constants::maxlog(), constants::infinity(), x); #endif return select(self < constants::logeps(), batch_type(-1.), x); } template XSIMD_INLINE batch, A> expm1(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch isin = sin(z.imag()); real_batch rem1 = expm1(z.real()); real_batch re = rem1 + 1.; real_batch si = sin(z.imag() * 0.5); return { rem1 - 2. * re * si * si, re * isin }; } // polar template XSIMD_INLINE batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept { auto sincosTheta = sincos(theta); return { r * sincosTheta.second, r * sincosTheta.first }; } // fdim template XSIMD_INLINE batch fdim(batch const& self, batch const& other, requires_arch) noexcept { return fmax(batch(0), self - other); } // fmod template XSIMD_INLINE batch fmod(batch const& self, batch const& other, requires_arch) noexcept { return fnma(trunc(self / other), other, self); } // frexp /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch frexp(const batch& self, batch, A>& exp, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; i_type m1f = constants::mask1frexp(); i_type r1 = m1f & ::xsimd::bitwise_cast(self); batch_type x = self & ::xsimd::bitwise_cast(~m1f); exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); exp = select(batch_bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); } // from bool template XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept { return batch((typename batch::register_type)self.data) & batch(1); } // horner template XSIMD_INLINE batch horner(const batch& self) noexcept { return detail::horner, Coefs...>(self); } // hypot template XSIMD_INLINE batch hypot(batch const& self, batch const& other, requires_arch) noexcept { return sqrt(fma(self, self, other * other)); } // ipow template XSIMD_INLINE batch ipow(batch const& self, ITy other, requires_arch) noexcept { return ::xsimd::detail::ipow(self, other); } // ldexp /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { using batch_type = batch; using itype = as_integer_t; itype ik = other + constants::maxexponent(); ik = ik << constants::nmb(); return self * ::xsimd::bitwise_cast(ik); } // lgamma template XSIMD_INLINE batch lgamma(batch const& self, requires_arch) noexcept; namespace detail { /* origin: boost/simd/arch/common/detail/common/gammaln_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static XSIMD_INLINE batch gammalnB(const batch& x) noexcept { return horner, 0x3ed87730, // 4.227843421859038E-001 0x3ea51a64, // 3.224669577325661E-001, 0xbd89f07e, // -6.735323259371034E-002, 0x3ca89ed8, // 2.058355474821512E-002, 0xbbf164fd, // -7.366775108654962E-003, 0x3b3ba883, // 2.863437556468661E-003, 0xbaabeab1, // -1.311620815545743E-003, 0x3a1ebb94 // 6.055172732649237E-004 >(x); } template static XSIMD_INLINE batch gammalnC(const batch& x) noexcept { return horner, 0xbf13c468, // -5.772156501719101E-001 0x3f528d34, // 8.224670749082976E-001, 0xbecd27a8, // -4.006931650563372E-001, 0x3e8a898b, // 2.705806208275915E-001, 0xbe53c04f, // -2.067882815621965E-001, 0x3e2d4dab, // 1.692415923504637E-001, 0xbe22d329, // -1.590086327657347E-001, 0x3e0c3c4f // 1.369488127325832E-001 >(x); } template static XSIMD_INLINE batch gammaln2(const batch& x) noexcept { return horner, 0x3daaaa94, // 8.333316229807355E-002f 0xbb358701, // -2.769887652139868E-003f, 0x3a31fd69 // 6.789774945028216E-004f >(x); } template static XSIMD_INLINE batch gammaln1(const batch& x) noexcept { return horner, 0xc12a0c675418055eull, // -8.53555664245765465627E5 
0xc13a45890219f20bull, // -1.72173700820839662146E6, 0xc131bc82f994db51ull, // -1.16237097492762307383E6, 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, 0xc09589018ff36761ull // -1.37825152569120859100E3 >(x) / horner, 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 0xc1435255892ff34cull, // -2.53252307177582951285E6, 0xc131628671950043ull, // -1.13933444367982507207E6, 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, 0x3ff0000000000000ull // 1.00000000000000000000E0 >(x); } template static XSIMD_INLINE batch gammalnA(const batch& x) noexcept { return horner, 0x3fb555555555554bull, // 8.33333333333331927722E-2 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, 0x3f4a985027336661ull // 8.11614167470508450300E-4 >(x); } /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct lgamma_impl; template struct lgamma_impl> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept { auto inf_result = (a <= batch_type(0.)) && is_flint(a); batch_type x = select(inf_result, constants::nan(), a); batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (x == constants::infinity()) || inf_result; #endif auto ltza = a < batch_type(0.); batch_type r(0); batch_type r1 = other(q); if (any(ltza)) { #ifdef __FAST_MATH__ r = negative(q, r1); #else r = select(inf_result, constants::infinity(), negative(q, r1)); #endif if (all(ltza)) return r; } batch_type r2 = select(ltza, r, r1); #ifdef __FAST_MATH__ return r2; #else return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); #endif } private: static XSIMD_INLINE batch_type negative(const batch_type& q, const batch_type& w) noexcept { batch_type p = floor(q); batch_type z = q - p; auto test2 = z < batch_type(0.5); z = select(test2, z - batch_type(1.), z); z = q * sin(z, trigo_pi_tag()); return -log(constants::invpi() * abs(z)) - w; } static XSIMD_INLINE batch_type other(const batch_type& x) noexcept { auto xlt650 = (x < batch_type(6.5)); batch_type r0x = x; batch_type r0z = x; batch_type r0s = batch_type(1.); batch_type r1 = batch_type(0.); batch_type p = constants::nan(); if (any(xlt650)) { batch_type z = batch_type(1.); batch_type tx = select(xlt650, x, batch_type(0.)); batch_type nx = batch_type(0.); const batch_type _075 = batch_type(0.75); const batch_type _150 = batch_type(1.50); const batch_type _125 = batch_type(1.25); const batch_type _250 = batch_type(2.50); auto xge150 = (x >= _150); auto txgt250 = (tx > _250); // x >= 1.5 while (any(xge150 && txgt250)) { nx = select(txgt250, nx - batch_type(1.), nx); tx = select(txgt250, x + nx, tx); z = select(txgt250, z * tx, z); txgt250 = 
(tx > _250); } r0x = select(xge150, x + nx - batch_type(2.), x); r0z = select(xge150, z, r0z); r0s = select(xge150, batch_type(1.), r0s); // x >= 1.25 && x < 1.5 auto xge125 = (x >= _125); auto xge125t = xge125 && !xge150; if (any(xge125)) { r0x = select(xge125t, x - batch_type(1.), r0x); r0z = select(xge125t, z * x, r0z); r0s = select(xge125t, batch_type(-1.), r0s); } // x >= 0.75 && x < 1.5 batch_bool kernelC(false); auto xge075 = (x >= _075); auto xge075t = xge075 && !xge125; if (any(xge075t)) { kernelC = xge075t; r0x = select(xge075t, x - batch_type(1.), x); r0z = select(xge075t, batch_type(1.), r0z); r0s = select(xge075t, batch_type(-1.), r0s); p = gammalnC(r0x); } // tx < 1.5 && x < 0.75 auto txlt150 = (tx < _150) && !xge075; if (any(txlt150)) { auto orig = txlt150; while (any(txlt150)) { z = select(txlt150, z * tx, z); nx = select(txlt150, nx + batch_type(1.), nx); tx = select(txlt150, x + nx, tx); txlt150 = (tx < _150) && !xge075; } r0x = select(orig, r0x + nx - batch_type(2.), r0x); r0z = select(orig, z, r0z); r0s = select(orig, batch_type(-1.), r0s); } p = select(kernelC, p, gammalnB(r0x)); if (all(xlt650)) return fma(r0x, p, r0s * log(abs(r0z))); } r0z = select(xlt650, abs(r0z), x); batch_type m = log(r0z); r1 = fma(r0x, p, r0s * m); batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); r2 += gammaln2(batch_type(1.) 
/ (x * x)) / x; return select(xlt650, r1, r2); } }; template struct lgamma_impl> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept { auto inf_result = (a <= batch_type(0.)) && is_flint(a); batch_type x = select(inf_result, constants::nan(), a); batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (q == constants::infinity()); #endif auto test = (a < batch_type(-34.)); batch_type r = constants::nan(); if (any(test)) { r = large_negative(q); if (all(test)) return select(inf_result, constants::nan(), r); } batch_type r1 = other(a); batch_type r2 = select(test, r, r1); #ifdef __FAST_MATH__ return r2; #else return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); #endif } private: // FIXME: cannot mark this one as XSIMD_INLINE because there's a // recursive loop on `lgamma'. static inline batch_type large_negative(const batch_type& q) noexcept { batch_type w = lgamma(q); batch_type p = floor(q); batch_type z = q - p; auto test2 = (z < batch_type(0.5)); z = select(test2, z - batch_type(1.), z); z = q * sin(z, trigo_pi_tag()); z = abs(z); return constants::logpi() - log(z) - w; } static XSIMD_INLINE batch_type other(const batch_type& xx) noexcept { batch_type x = xx; auto test = (x < batch_type(13.)); batch_type r1 = batch_type(0.); if (any(test)) { batch_type z = batch_type(1.); batch_type p = batch_type(0.); batch_type u = select(test, x, batch_type(0.)); auto test1 = (u >= batch_type(3.)); while (any(test1)) { p = select(test1, p - batch_type(1.), p); u = select(test1, x + p, u); z = select(test1, z * u, z); test1 = (u >= batch_type(3.)); } auto test2 = (u < batch_type(2.)); while (any(test2)) { z = select(test2, z / u, z); p = select(test2, p + batch_type(1.), p); u = select(test2, x + p, u); test2 = (u < batch_type(2.)); } z = abs(z); x += p - batch_type(2.); r1 = x * gammaln1(x) + log(z); if (all(test)) return r1; } batch_type r2 = fma(xx - 
batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); batch_type p = batch_type(1.) / (xx * xx); r2 += gammalnA(p) / xx; return select(test, r1, r2); } }; } template XSIMD_INLINE batch lgamma(batch const& self, requires_arch) noexcept { return detail::lgamma_impl>::compute(self); } // log /* origin: boost/simd/arch/common/simd/function/log.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch log(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(23), k); x = select(test, x * batch_type(8388608ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling"); batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(self >= batch_type(0.)), constants::nan(), zz); #endif } template XSIMD_INLINE batch log(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "keep compensated k conversion before split log(2) scaling"); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type hfsq = batch_type(0.5) * f * f; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(self >= batch_type(0.)), constants::nan(), zz); #endif } template XSIMD_INLINE batch, A> log(const batch, A>& z, requires_arch) noexcept { return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); } // log2 template XSIMD_INLINE batch log2(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(25), k); x = select(test, x * batch_type(33554432ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t1 + t2; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(self >= batch_type(0.)), constants::nan(), zz); #endif } template XSIMD_INLINE batch log2(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type hi = f - hfsq; hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); batch_type lo = fma(s, hfsq + R, f - hi - hfsq); batch_type val_hi = hi * constants::invlog_2hi(); batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); batch_type dk = to_float(k); detail::reassociation_barrier(dk, "Kahan compensated log2 summation"); batch_type w1 = dk + val_hi; detail::reassociation_barrier(w1, "Kahan compensated log2 summation"); val_lo += (dk - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(self >= batch_type(0.)), constants::nan(), zz); #endif } namespace detail { template XSIMD_INLINE batch logN_complex_impl(const batch& z, typename batch::value_type base) noexcept { using batch_type = batch; using rv_type = typename batch_type::value_type; return log(z) / batch_type(rv_type(base)); } } template XSIMD_INLINE batch, A> log2(batch, A> const& self, requires_arch) noexcept { return detail::logN_complex_impl(self, std::log(2)); } // log10 /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. 
* ==================================================== */ template XSIMD_INLINE batch log10(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type ivln10hi(4.3432617188e-01f), ivln10lo(-3.1689971365e-05f), log10_2hi(3.0102920532e-01f), log10_2lo(7.9034151668e-07f); using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(25), k); x = select(test, x * batch_type(33554432ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); batch_type hfsq = batch_type(0.5) * f * f; batch_type hibits = f - hfsq; hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); batch_type r = fma(dk, log10_2hi, fma(hibits, ivln10hi, fma(lobits, ivln10hi, fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else assert(all(isnez) && "Calling log10 on a batch with zero value while XSIMD_NO_INFINITIES is active"); batch_type zz = r; #endif #ifndef XSIMD_NO_NANS return select(!(self >= batch_type(0.)), constants::nan(), zz); #else assert(all(self >= batch_type(0.)) && "Calling log10 on a batch with negative value while XSIMD_NO_NANS is active"); return zz; #endif } template 
XSIMD_INLINE batch log10(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type ivln10hi(4.34294481878168880939e-01), ivln10lo(2.50829467116452752298e-11), log10_2hi(3.01029995663611771306e-01), log10_2lo(3.69423907715893078616e-13); using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type hi = f - hfsq; hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); batch_type lo = f - hi - hfsq + s * (hfsq + R); batch_type val_hi = hi * ivln10hi; batch_type y = dk * log10_2hi; batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; batch_type w1 = y + val_hi; val_lo += (y - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(self >= batch_type(0.)), constants::nan(), zz); #endif } template XSIMD_INLINE batch, A> log10(const batch, A>& z, requires_arch) noexcept { return detail::logN_complex_impl(z, std::log(10)); } // log1p /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch log1p(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; const batch_type uf = self + batch_type(1.); auto isnez = (uf != batch_type(0.)); i_type iu = ::xsimd::bitwise_cast(uf); iu += 0x3f800000 - 0x3f3504f3; i_type k = (iu >> 23) - 0x7f; iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; batch_type f = --(::xsimd::bitwise_cast(iu)); batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(uf >= batch_type(0.)), constants::nan(), zz); #endif } template XSIMD_INLINE batch log1p(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; const batch_type uf = self + batch_type(1.); auto isnez = (uf != batch_type(0.)); i_type hu = ::xsimd::bitwise_cast(uf) >> 32; hu += 0x3ff00000 - 0x3fe6a09e; i_type k = (hu >> 20) - 0x3ff; /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); f = --f; batch_type hfsq = batch_type(0.5) * f * f; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); detail::reassociation_barrier(dk, "prevent distributing multiplies through compensated exponent conversion"); batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); #ifdef __FAST_MATH__ return r; #else batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); return select(!(uf >= batch_type(0.)), constants::nan(), zz); #endif } template XSIMD_INLINE batch, A> log1p(batch, A> const& self, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type u = 1 + self; batch_type logu = log(u); return select(u == batch_type(1.), self, select(u.real() <= real_batch(0.), logu, logu * self / (u - batch_type(1.)))); } // mod template ::value>> XSIMD_INLINE batch mod(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x % y; }, self, other); } // nearbyint template ::value>> XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return self; } namespace detail { template XSIMD_INLINE batch nearbyintf(batch const& self) noexcept { using batch_type = batch; batch_type s = bitofsign(self); batch_type v = self ^ s; batch_type t2n = constants::twotonmb(); batch_type d0 = v + t2n; detail::reassociation_barrier(d0, "prevent collapsing (v + 2^n) - 2^n back to v"); batch_type d = d0 - t2n; return s ^ select(v < t2n, d, v); } } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return detail::nearbyintf(self); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return detail::nearbyintf(self); } // nearbyint_as_int template ::value>> XSIMD_INLINE batch nearbyint_as_int(batch const& 
self, requires_arch) noexcept { return self; } // nearbyint_as_int template XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, requires_arch) noexcept { using U = as_integer_t; return kernel::detail::apply_transform([](float x) noexcept -> U { return std::nearbyintf(x); }, self); } template XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, requires_arch) noexcept { using U = as_integer_t; return kernel::detail::apply_transform([](double x) noexcept -> U { return std::nearbyint(x); }, self); } // nextafter namespace detail { template ::value> struct nextafter_kernel { using batch_type = batch; static XSIMD_INLINE batch_type next(batch_type const& b) noexcept { return b; } static XSIMD_INLINE batch_type prev(batch_type const& b) noexcept { return b; } }; template struct bitwise_cast_batch; template struct bitwise_cast_batch { using type = batch; }; template struct bitwise_cast_batch { using type = batch; }; template struct nextafter_kernel { using batch_type = batch; using int_batch = typename bitwise_cast_batch::type; using int_type = typename int_batch::value_type; static XSIMD_INLINE batch_type next(const batch_type& b) noexcept { batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); #ifdef __FAST_MATH__ return n; #else return select(b == constants::infinity(), b, n); #endif } static XSIMD_INLINE batch_type prev(const batch_type& b) noexcept { batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); #ifdef __FAST_MATH__ return p; #else return select(b == constants::minusinfinity(), b, p); #endif } }; } template XSIMD_INLINE batch nextafter(batch const& from, batch const& to, requires_arch) noexcept { using kernel = detail::nextafter_kernel; return select(from == to, from, select(to > from, kernel::next(from), kernel::prev(from))); } // pow /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed 
under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch pow(batch const& self, batch const& other, requires_arch) noexcept { using batch_type = batch; const auto zero = batch_type(0.); auto negself = self < zero; auto iszeropowpos = self == zero && other >= zero; auto adj_self = select(iszeropowpos, batch_type(1), abs(self)); batch_type z = exp(other * log(adj_self)); z = select(iszeropowpos, zero, z); z = select(is_odd(other) && negself, -z, z); auto invalid = negself && !(is_flint(other) || isinf(other)); return select(invalid, constants::nan(), z); } template XSIMD_INLINE batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) noexcept { using cplx_batch = batch, A>; using real_batch = typename cplx_batch::real_batch; real_batch absa = abs(a); real_batch arga = arg(a); real_batch x = z.real(); real_batch y = z.imag(); real_batch r = pow(absa, x); real_batch theta = x * arga; real_batch ze(0); auto cond = (y == ze); r = select(cond, r, r * exp(-y * arga)); theta = select(cond, theta, theta + y * log(absa)); auto sincosTheta = xsimd::sincos(theta); return select(absa == ze, cplx_batch(ze), cplx_batch(r * sincosTheta.second, r * sincosTheta.first)); } template inline batch, A> pow(const batch, A>& a, const batch& z, requires_arch) noexcept { using cplx_batch = batch, A>; auto absa = abs(a); auto arga = arg(a); auto r = pow(absa, z); auto theta = z * arga; auto sincosTheta = xsimd::sincos(theta); return select(absa == 0, cplx_batch(0), cplx_batch(r * sincosTheta.second, r * sincosTheta.first)); } template inline batch, A> pow(const batch& a, const batch, A>& z, requires_arch) noexcept { return pow(batch, A> { a, batch {} }, z); } // reciprocal template ::value>> XSIMD_INLINE batch reciprocal(batch const& self, requires_arch) noexcept { using batch_type = batch; return div(batch_type(1), self); } // reduce_add template 
XSIMD_INLINE std::complex reduce_add(batch, A> const& self, requires_arch) noexcept { return { reduce_add(self.real()), reduce_add(self.imag()) }; } template ::value>*/> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(buffer); T res = 0; for (T val : buffer) { res += val; } return res; } namespace detail { template struct split_high { static constexpr T get(T i, T) { return i < N ? (i + N) : ((i % N) + N); } }; template XSIMD_INLINE T reduce(Op, batch const& self, std::integral_constant) noexcept { return ::xsimd::kernel::first(self, A {}); } template XSIMD_INLINE T reduce(Op op, batch const& self, std::integral_constant) noexcept { using index_type = as_unsigned_integer_t; batch split = swizzle(self, make_batch_constant, A>()); return reduce(op, op(split, self), std::integral_constant()); } } // reduce_max template XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { return detail::reduce([](batch const& x, batch const& y) { return max(x, y); }, self, std::integral_constant::size>()); } // reduce_min template XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { return detail::reduce([](batch const& x, batch const& y) { return min(x, y); }, self, std::integral_constant::size>()); } // reduce_mul template XSIMD_INLINE std::complex reduce_mul(batch, A> const& self, requires_arch) noexcept { // FIXME: could do better alignas(A::alignment()) std::complex buffer[batch, A>::size]; self.store_aligned(buffer); std::complex res = 1; for (auto val : buffer) { res *= val; } return res; } template ::value>*/> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(buffer); T res = 1; for (T val : buffer) { res *= val; } return res; } // remainder template XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { batch q = nearbyint(self / 
other); detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient"); return fnma(q, other, self); } template XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { batch q = nearbyint(self / other); detail::reassociation_barrier(q, "prevent pulling multiply back through rounded quotient"); return fnma(q, other, self); } template ::value>> XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { auto mod = self % other; return select(mod <= other / 2, mod, mod - other); } // select template XSIMD_INLINE batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) noexcept { return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) }; } // sign template ::value>> XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); return res; } namespace detail { template XSIMD_INLINE batch signf(batch const& self) noexcept { using batch_type = batch; batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); #ifdef XSIMD_NO_NANS return res; #else return select(isnan(self), constants::nan(), res); #endif } } template XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept { return detail::signf(self); } template XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept { return detail::signf(self); } template XSIMD_INLINE batch, A> sign(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; auto rz = z.real(); auto iz = z.imag(); return select(rz != real_batch(0.), batch_type(sign(rz)), batch_type(sign(iz))); } // signnz template 
::value>> XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept { using batch_type = batch; return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); } namespace detail { template XSIMD_INLINE batch signnzf(batch const& self) noexcept { using batch_type = batch; #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); #else return batch_type(1.) | (constants::signmask() & self); #endif } } template XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept { return detail::signnzf(self); } template XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept { return detail::signnzf(self); } // sqrt template XSIMD_INLINE batch, A> sqrt(batch, A> const& z, requires_arch) noexcept { constexpr T csqrt_scale_factor = std::is_same::value ? 6.7108864e7f : 1.8014398509481984e16; constexpr T csqrt_scale = std::is_same::value ? 1.220703125e-4f : 7.450580596923828125e-9; using batch_type = batch, A>; using real_batch = batch; real_batch x = z.real(); real_batch y = z.imag(); real_batch sqrt_x = sqrt(fabs(x)); real_batch sqrt_hy = sqrt(0.5 * fabs(y)); auto cond = (fabs(x) > real_batch(4.) 
|| fabs(y) > real_batch(4.)); x = select(cond, x * 0.25, x * csqrt_scale_factor); y = select(cond, y * 0.25, y * csqrt_scale_factor); real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); real_batch r = abs(batch_type(x, y)); auto condxp = x > real_batch(0.); real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); real_batch r0 = scale * fabs((0.5 * y) / t0); t0 *= scale; real_batch t = select(condxp, t0, r0); r = select(condxp, r0, t0); batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); real_batch ze(0.); return select(y == ze, select(x == ze, batch_type(ze, ze), select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), select(x == ze, select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), resg)); } // tgamma namespace detail { /* origin: boost/simd/arch/common/detail/common/stirling_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct stirling_kernel; template struct stirling_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x); } static XSIMD_INLINE batch_type split_limit() noexcept { return batch_type(bit_cast(uint32_t(0x41d628f6))); } static XSIMD_INLINE batch_type large_limit() noexcept { return batch_type(bit_cast(uint32_t(0x420c28f3))); } }; template struct stirling_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x); } static XSIMD_INLINE batch_type split_limit() noexcept { return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); } static XSIMD_INLINE batch_type large_limit() noexcept { return batch_type(bit_cast(uint64_t(0x4065800000000000))); } }; /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch stirling(const batch& a) noexcept { using batch_type = batch; const batch_type stirlingsplitlim = stirling_kernel::split_limit(); const batch_type stirlinglargelim = stirling_kernel::large_limit(); batch_type x = select(a >= batch_type(0.), a, constants::nan()); batch_type w = batch_type(1.) 
/ x; w = fma(w, stirling_kernel::compute(w), batch_type(1.)); batch_type y = exp(-x); auto test = (x < stirlingsplitlim); batch_type z = x - batch_type(0.5); z = select(test, z, batch_type(0.5) * z); batch_type v = exp(z * log(abs(x))); y *= v; y = select(test, y, y * v); y *= constants::sqrt_2pi() * w; #ifdef __FAST_MATH__ return y; #else y = select(isinf(x), x, y); return select(x > stirlinglargelim, constants::infinity(), y); #endif } /* origin: boost/simd/arch/common/detail/common/gamma_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tgamma_kernel; template struct tgamma_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x); } }; template struct tgamma_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x) / horner(x); } }; /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE B tgamma_large_negative(const B& a) noexcept { B st = stirling(a); B p = floor(a); B sgngam = select(is_even(p), -B(1.), B(1.)); B z = a - p; auto test2 = z < B(0.5); z = select(test2, z - B(1.), z); z = a * sin(z, trigo_pi_tag()); z = abs(z); return sgngam * constants::pi() / (z * st); } template XSIMD_INLINE B tgamma_other(const B& a, const BB& test) noexcept { B x = select(test, B(2.), a); #ifndef XSIMD_NO_INFINITIES auto inf_result = (a == constants::infinity()); x = select(inf_result, B(2.), x); #endif B z = B(1.); auto test1 = (x >= B(3.)); while (any(test1)) { x = select(test1, x - B(1.), x); z = select(test1, z * x, z); test1 = (x >= B(3.)); } test1 = (x < B(0.)); while (any(test1)) { z = select(test1, z / x, z); x = select(test1, x + B(1.), x); test1 = (x < B(0.)); } auto test2 = (x < B(2.)); while (any(test2)) { z = select(test2, z / x, z); x = select(test2, x + B(1.), x); test2 = (x < B(2.)); } x = z * tgamma_kernel::compute(x - B(2.)); #ifndef XSIMD_NO_INFINITIES return select(inf_result, a, x); #else return x; #endif } } template XSIMD_INLINE batch tgamma(batch const& self, requires_arch) noexcept { using batch_type = batch; auto nan_result = (self < batch_type(0.) 
&& is_flint(self)); #ifndef XSIMD_NO_NANS nan_result = isnan(self) || nan_result; #endif batch_type q = abs(self); auto test = (self < batch_type(-33.)); batch_type r = constants::nan(); if (any(test)) { r = detail::tgamma_large_negative(q); if (all(test)) return select(nan_result, constants::nan(), r); } batch_type r1 = detail::tgamma_other(self, test); batch_type r2 = select(test, r, r1); #ifdef __FAST_MATH__ return r2; #else return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); #endif } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_memory.hpp000066400000000000000000001353541517435117100275130ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_MEMORY_HPP #define XSIMD_COMMON_MEMORY_HPP #include #include #include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_common_details.hpp" namespace xsimd { template struct batch_constant; template struct batch_bool_constant; namespace kernel { using namespace types; // broadcast namespace detail { template struct broadcaster { using return_type = batch; static XSIMD_INLINE return_type run(T v) noexcept { return return_type::broadcast(v); } }; template struct broadcaster { using return_type = batch_bool, A>; static XSIMD_INLINE return_type run(bool b) noexcept { return return_type(b); } }; } // compress namespace detail { template XSIMD_INLINE batch create_compress_swizzle_mask(I bitmask, std::index_sequence) { batch swizzle_mask(IT(0)); alignas(A::alignment()) IT mask_buffer[batch::size] = { Is... }; size_t inserted = 0; for (size_t i = 0; i < sizeof...(Is); ++i) if ((bitmask >> i) & 1u) std::swap(mask_buffer[inserted++], mask_buffer[i]); return batch::load_aligned(&mask_buffer[0]); } } template XSIMD_INLINE batch compress(batch const& x, batch_bool const& mask, kernel::requires_arch) noexcept { using IT = as_unsigned_integer_t; constexpr std::size_t size = batch_bool::size; auto bitmask = mask.mask(); auto z = select(mask, x, batch((T)0)); auto compress_mask = detail::create_compress_swizzle_mask(bitmask, std::make_index_sequence()); return swizzle(z, compress_mask); } // expand template XSIMD_INLINE batch expand(batch const& x, batch_bool const& mask, kernel::requires_arch) noexcept { constexpr auto size = batch::size; alignas(A::alignment()) T x_in[size], x_out[size] = { T() }; x.store_aligned(x_in); int i = 0, j = 0; for (auto bitmask = mask.mask(); bitmask; bitmask >>= 1, ++i) { if (bitmask & 1) x_out[i] = x_in[j++]; } return xsimd::batch::load_aligned(x_out); } // extract_pair template XSIMD_INLINE batch extract_pair(batch const& self, batch 
const& other, std::size_t i, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(i < size && "index in bounds"); alignas(A::alignment()) T self_buffer[size]; self.store_aligned(self_buffer); alignas(A::alignment()) T other_buffer[size]; other.store_aligned(other_buffer); alignas(A::alignment()) T concat_buffer[size]; for (std::size_t j = 0; j < (size - i); ++j) { concat_buffer[j] = other_buffer[i + j]; if (j < i) { concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; } } return batch::load_aligned(concat_buffer); } // gather namespace detail { // Not using XSIMD_INLINE here as it makes msvc hand got ever on avx512 template = 0> inline batch gather(U const* src, batch const& index, ::xsimd::index I) noexcept { return insert(batch {}, static_cast(src[index.get(I)]), I); } template = 0> inline batch gather(U const* src, batch const& index, ::xsimd::index I) noexcept { static_assert(N <= batch::size, "Incorrect value in recursion!"); const auto test = gather(src, index, {}); return insert(test, static_cast(src[index.get(I)]), I); } } // namespace detail template XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return detail::gather::size - 1, T, A>(src, index, {}); } // Gather with runtime indexes and mismatched strides. template XSIMD_INLINE detail::sizes_mismatch_t> gather(batch const&, U const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return detail::gather::size - 1, T, A>(src, index, {}); } // Gather with runtime indexes and matching strides. 
template XSIMD_INLINE detail::stride_match_t> gather(batch const&, U const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return batch_cast(kernel::gather(batch {}, src, index, A {})); } // insert template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { struct index_mask { static constexpr bool get(size_t index, size_t /* size*/) { return index != I; } }; batch tmp(val); return select(make_batch_bool_constant(), self, tmp); } // get template XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template XSIMD_INLINE T get(batch_bool const& self, ::xsimd::index, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch_bool::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template XSIMD_INLINE typename batch, A>::value_type get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept { using value_type = typename batch, A>::value_type; alignas(A::alignment()) value_type buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template XSIMD_INLINE T get(batch const& self, std::size_t i, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(&buffer[0]); return buffer[i]; } template XSIMD_INLINE T get(batch_bool const& self, std::size_t i, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; self.store_aligned(&buffer[0]); return buffer[i]; } template XSIMD_INLINE typename batch, A>::value_type get(batch, A> const& self, std::size_t i, requires_arch) noexcept { using T2 = typename batch, A>::value_type; alignas(A::alignment()) T2 buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[i]; } // first template XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { return get(self, 
0, common {}); } template XSIMD_INLINE T first(batch_bool const& self, requires_arch) noexcept { return first(batch(self), A {}); } template XSIMD_INLINE typename batch, A>::value_type first(batch, A> const& self, requires_arch) noexcept { return { first(self.real(), A {}), first(self.imag(), A {}) }; } // load template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { using batch_type = batch; batch_type ref(0); constexpr auto size = batch_bool::size; alignas(A::alignment()) T buffer[size]; for (std::size_t i = 0; i < size; ++i) buffer[i] = mem[i] ? 1 : 0; return ref != batch_type::load_aligned(&buffer[0]); } template XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool b, requires_arch) noexcept { return load_unaligned(mem, b, A {}); } template XSIMD_INLINE batch_bool load_stream(bool const* mem, batch_bool b, requires_arch) noexcept { return load_aligned(mem, b, A {}); } // load_aligned namespace detail { template XSIMD_INLINE batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept { using batch_type_in = batch; using batch_type_out = batch; return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {}); } template XSIMD_INLINE batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be a direct load for this type combination"); using batch_type_out = batch; alignas(A::alignment()) T_out buffer[batch_type_out::size]; std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); return batch_type_out::load_aligned(buffer); } } template XSIMD_INLINE batch load_aligned(T_in const* mem, convert cvt, requires_arch) noexcept { return detail::load_aligned(mem, cvt, A {}, detail::conversion_type {}); } // load_unaligned namespace detail { template XSIMD_INLINE batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept { using batch_type_in = batch; 
using batch_type_out = batch; return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {}); } template XSIMD_INLINE batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be a direct load for this type combination"); return load_aligned(mem, cvt, common {}, with_slow_conversion {}); } } template XSIMD_INLINE batch load_unaligned(T_in const* mem, convert cvt, requires_arch) noexcept { return detail::load_unaligned(mem, cvt, common {}, detail::conversion_type {}); } template XSIMD_INLINE batch load(T const* mem, aligned_mode, requires_arch) noexcept { return load_aligned(mem, convert {}, A {}); } template XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept { return load_unaligned(mem, convert {}, A {}); } template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant, convert, alignment, requires_arch) noexcept { constexpr std::size_t size = batch::size; alignas(A::alignment()) std::array buffer {}; constexpr bool mask[size] = { Values... }; for (std::size_t i = 0; i < size; ++i) buffer[i] = mask[i] ? static_cast(mem[i]) : T_out(0); return batch::load(buffer.data(), aligned_mode {}); } template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant, alignment, requires_arch) noexcept { constexpr std::size_t size = batch::size; constexpr bool mask[size] = { Values... 
}; for (std::size_t i = 0; i < size; ++i) if (mask[i]) { mem[i] = static_cast(src.get(i)); } } template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(f); } template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto f = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(f); } template XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(d); } template XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept { const auto d = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, A {}); return bitwise_cast(d); } template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { 
store_masked(reinterpret_cast(mem), bitwise_cast(src), batch_bool_constant {}, Mode {}, A {}); } template XSIMD_INLINE batch load_stream(T_in const* mem, convert cvt, requires_arch) noexcept { return load_aligned(mem, cvt, A {}); } // rotate_right template XSIMD_INLINE batch rotate_right(batch const& self, requires_arch) noexcept { struct rotate_generator { static constexpr size_t get(size_t index, size_t size) { return (index - N) % size; } }; return swizzle(self, make_batch_constant, rotate_generator, A>()); } template XSIMD_INLINE batch, A> rotate_right(batch, A> const& self, requires_arch) noexcept { return { rotate_right(self.real()), rotate_right(self.imag()) }; } // rotate_left template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { struct rotate_generator { static constexpr size_t get(size_t index, size_t size) { return (index + N) % size; } }; return swizzle(self, make_batch_constant, rotate_generator, A>()); } template XSIMD_INLINE batch, A> rotate_left(batch, A> const& self, requires_arch) noexcept { return { rotate_left(self.real()), rotate_left(self.imag()) }; } // Scatter with runtime indexes. 
namespace detail { template = 0> XSIMD_INLINE void scatter(batch const& src, U* dst, batch const& index, ::xsimd::index I) noexcept { dst[index.get(I)] = static_cast(src.get(I)); } template = 0> XSIMD_INLINE void scatter(batch const& src, U* dst, batch const& index, ::xsimd::index I) noexcept { static_assert(N <= batch::size, "Incorrect value in recursion!"); kernel::detail::scatter( src, dst, index, {}); dst[index.get(I)] = static_cast(src.get(I)); } } // namespace detail template XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); kernel::detail::scatter::size - 1, T, A, T, V>( src, dst, index, {}); } template XSIMD_INLINE detail::sizes_mismatch_t scatter(batch const& src, U* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); kernel::detail::scatter::size - 1, T, A, U, V>( src, dst, index, {}); } template XSIMD_INLINE detail::stride_match_t scatter(batch const& src, U* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); const auto tmp = batch_cast(src); kernel::scatter(tmp, dst, index, A {}); } // shuffle namespace detail { constexpr bool is_swizzle_fst(size_t) { return true; } template constexpr bool is_swizzle_fst(size_t bsize, ITy index, ITys... indices) { return index < bsize && is_swizzle_fst(bsize, indices...); } constexpr bool is_swizzle_snd(size_t) { return true; } template constexpr bool is_swizzle_snd(size_t bsize, ITy index, ITys... indices) { return index >= bsize && is_swizzle_snd(bsize, indices...); } constexpr bool is_zip_lo(size_t) { return true; } template constexpr bool is_zip_lo(size_t, ITy) { return false; } template constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... 
indices) { return index0 == (bsize - (sizeof...(indices) + 2)) && index1 == (2 * bsize - (sizeof...(indices) + 2)) && is_zip_lo(bsize, indices...); } constexpr bool is_zip_hi(size_t) { return true; } template constexpr bool is_zip_hi(size_t, ITy) { return false; } template constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { return index0 == (bsize / 2 + bsize - (sizeof...(indices) + 2)) && index1 == (bsize / 2 + 2 * bsize - (sizeof...(indices) + 2)) && is_zip_hi(bsize, indices...); } constexpr bool is_select(size_t) { return true; } template constexpr bool is_select(size_t bsize, ITy index, ITys... indices) { return (index < bsize ? index : index - bsize) == (bsize - sizeof...(ITys)) && is_select(bsize, indices...); } } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { constexpr size_t bsize = sizeof...(Indices); static_assert(bsize == batch::size, "valid shuffle"); // Detect common patterns XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...)) { return swizzle(x, batch_constant= bsize) ? 0 /* never happens */ : Indices)...>()); } XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...)) { return swizzle(y, batch_constant= bsize) ? 
(Indices - bsize) : 0 /* never happens */)...>()); } XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...)) { return zip_lo(x, y); } XSIMD_IF_CONSTEXPR(detail::is_zip_hi(bsize, Indices...)) { return zip_hi(x, y); } XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...)) { return select(batch_bool_constant(), x, y); } #if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED) #if __has_builtin(__builtin_shufflevector) #define builtin_shuffle __builtin_shufflevector #endif #endif #if defined(builtin_shuffle) typedef T vty __attribute__((__vector_size__(sizeof(batch)))); return (typename batch::register_type)builtin_shuffle((vty)x.data, (vty)y.data, Indices...); // FIXME: my experiments show that GCC only correctly optimizes this builtin // starting at GCC 13, where it already has __builtin_shuffle_vector // // #elif __has_builtin(__builtin_shuffle) || GCC >= 6 // typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch)))); // return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...}); #else // Use a common_pattern. It is suboptimal but clang optimizes this // pretty well. batch x_lane = swizzle(x, batch_constant= bsize) ? (Indices - bsize) : Indices)...>()); batch y_lane = swizzle(y, batch_constant= bsize) ? 
(Indices - bsize) : Indices)...>()); batch_bool_constant select_x_lane; return select(select_x_lane, x_lane, y_lane); #endif } // store template XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept { using batch_type = batch; constexpr auto size = batch_bool::size; alignas(A::alignment()) T buffer[size]; kernel::store_aligned(&buffer[0], batch_type(self), A {}); for (std::size_t i = 0; i < size; ++i) mem[i] = bool(buffer[i]); } template XSIMD_INLINE void store_stream(batch_bool const& self, bool* mem, requires_arch) noexcept { store(self, mem, A {}); } // store_aligned template XSIMD_INLINE void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept { static_assert(!std::is_same::value, "there should be a direct store for this type combination"); alignas(A::alignment()) T_in buffer[batch::size]; store_aligned(&buffer[0], self); std::copy(std::begin(buffer), std::end(buffer), mem); } // store_unaligned template XSIMD_INLINE void store_unaligned(T_out* mem, batch const& self, requires_arch) noexcept { static_assert(!std::is_same::value, "there should be a direct store for this type combination"); return store_aligned(mem, self, common {}); } template XSIMD_INLINE void store_stream(T_out* mem, batch const& self, requires_arch) noexcept { store_aligned(mem, self, A {}); } // swizzle template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant mask, requires_arch) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { constexpr size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; store_aligned(&self_buffer[0], self); alignas(A::alignment()) ITy mask_buffer[size]; store_aligned(&mask_buffer[0], mask); alignas(A::alignment()) T out_buffer[size]; for (size_t i = 0; i < size; ++i) out_buffer[i] = self_buffer[mask_buffer[i]]; return batch::load_aligned(out_buffer); } template 
XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; store_aligned(&self_buffer[0], self); return { self_buffer[Is]... }; } template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch mask, requires_arch) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } // load_complex_aligned namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) noexcept { static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); } template XSIMD_INLINE batch complex_high(batch, A> const& /*src*/, requires_arch) noexcept { static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); } template XSIMD_INLINE batch complex_low(batch, A> const& /*src*/, requires_arch) noexcept { static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); } } template XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; T_in const* buffer = reinterpret_cast(mem); real_batch hi = real_batch::load_aligned(buffer), lo = real_batch::load_aligned(buffer + real_batch::size); return detail::load_complex(hi, lo, A {}); } // load_complex_unaligned template XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; T_in const* buffer = reinterpret_cast(mem); real_batch hi = real_batch::load_unaligned(buffer), lo = real_batch::load_unaligned(buffer + real_batch::size); return detail::load_complex(hi, lo, A {}); } template XSIMD_INLINE batch, A> load_complex_stream(std::complex const* mem, convert>, requires_arch) noexcept { return load_complex_aligned(mem, kernel::convert> {}, A {}); } // store_complex_aligned template XSIMD_INLINE void 
store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { using real_batch = batch; real_batch hi = detail::complex_high(src, A {}); real_batch lo = detail::complex_low(src, A {}); T_out* buffer = reinterpret_cast(dst); lo.store_aligned(buffer); hi.store_aligned(buffer + real_batch::size); } // store_complex_unaligned template XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { using real_batch = batch; real_batch hi = detail::complex_high(src, A {}); real_batch lo = detail::complex_low(src, A {}); T_out* buffer = reinterpret_cast(dst); lo.store_unaligned(buffer); hi.store_unaligned(buffer + real_batch::size); } template XSIMD_INLINE void store_complex_stream(std::complex* dst, batch, A> const& src, requires_arch) noexcept { store_complex_aligned(dst, src, A {}); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; alignas(A::alignment()) T scratch_buffer[batch::size * batch::size]; for (size_t i = 0; i < batch::size; ++i) { matrix_begin[i].store_aligned(&scratch_buffer[i * batch::size]); } // FIXME: this is super naive we can probably do better. 
for (size_t i = 0; i < batch::size; ++i) { for (size_t j = 0; j < i; ++j) { std::swap(scratch_buffer[i * batch::size + j], scratch_buffer[j * batch::size + i]); } } for (size_t i = 0; i < batch::size; ++i) { matrix_begin[i] = batch::load_aligned(&scratch_buffer[i * batch::size]); } } // transpose template ::size == 8>> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto l0 = zip_lo(matrix_begin[0], matrix_begin[1]); auto l1 = zip_lo(matrix_begin[2], matrix_begin[3]); auto l2 = zip_lo(matrix_begin[4], matrix_begin[5]); auto l3 = zip_lo(matrix_begin[6], matrix_begin[7]); auto l4 = zip_lo(bit_cast>(l0), bit_cast>(l1)); auto l5 = zip_lo(bit_cast>(l2), bit_cast>(l3)); auto l6 = zip_hi(bit_cast>(l0), bit_cast>(l1)); auto l7 = zip_hi(bit_cast>(l2), bit_cast>(l3)); auto h0 = zip_hi(matrix_begin[0], matrix_begin[1]); auto h1 = zip_hi(matrix_begin[2], matrix_begin[3]); auto h2 = zip_hi(matrix_begin[4], matrix_begin[5]); auto h3 = zip_hi(matrix_begin[6], matrix_begin[7]); auto h4 = zip_lo(bit_cast>(h0), bit_cast>(h1)); auto h5 = zip_lo(bit_cast>(h2), bit_cast>(h3)); auto h6 = zip_hi(bit_cast>(h0), bit_cast>(h1)); auto h7 = zip_hi(bit_cast>(h2), bit_cast>(h3)); matrix_begin[0] = bit_cast>(zip_lo(bit_cast>(l4), bit_cast>(l5))); matrix_begin[1] = bit_cast>(zip_hi(bit_cast>(l4), bit_cast>(l5))); matrix_begin[2] = bit_cast>(zip_lo(bit_cast>(l6), bit_cast>(l7))); matrix_begin[3] = bit_cast>(zip_hi(bit_cast>(l6), bit_cast>(l7))); matrix_begin[4] = bit_cast>(zip_lo(bit_cast>(h4), bit_cast>(h5))); matrix_begin[5] = bit_cast>(zip_hi(bit_cast>(h4), bit_cast>(h5))); matrix_begin[6] = bit_cast>(zip_lo(bit_cast>(h6), bit_cast>(h7))); matrix_begin[7] = bit_cast>(zip_hi(bit_cast>(h6), bit_cast>(h7))); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), 
reinterpret_cast*>(matrix_end), A {}); } template ::size == 16>> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto l0 = zip_lo(matrix_begin[0], matrix_begin[1]); auto l1 = zip_lo(matrix_begin[2], matrix_begin[3]); auto l2 = zip_lo(matrix_begin[4], matrix_begin[5]); auto l3 = zip_lo(matrix_begin[6], matrix_begin[7]); auto l4 = zip_lo(matrix_begin[8], matrix_begin[9]); auto l5 = zip_lo(matrix_begin[10], matrix_begin[11]); auto l6 = zip_lo(matrix_begin[12], matrix_begin[13]); auto l7 = zip_lo(matrix_begin[14], matrix_begin[15]); auto h0 = zip_hi(matrix_begin[0], matrix_begin[1]); auto h1 = zip_hi(matrix_begin[2], matrix_begin[3]); auto h2 = zip_hi(matrix_begin[4], matrix_begin[5]); auto h3 = zip_hi(matrix_begin[6], matrix_begin[7]); auto h4 = zip_hi(matrix_begin[8], matrix_begin[9]); auto h5 = zip_hi(matrix_begin[10], matrix_begin[11]); auto h6 = zip_hi(matrix_begin[12], matrix_begin[13]); auto h7 = zip_hi(matrix_begin[14], matrix_begin[15]); auto L0 = zip_lo(bit_cast>(l0), bit_cast>(l1)); auto L1 = zip_lo(bit_cast>(l2), bit_cast>(l3)); auto L2 = zip_lo(bit_cast>(l4), bit_cast>(l5)); auto L3 = zip_lo(bit_cast>(l6), bit_cast>(l7)); auto m0 = zip_lo(bit_cast>(L0), bit_cast>(L1)); auto m1 = zip_lo(bit_cast>(L2), bit_cast>(L3)); auto m2 = zip_hi(bit_cast>(L0), bit_cast>(L1)); auto m3 = zip_hi(bit_cast>(L2), bit_cast>(L3)); matrix_begin[0] = bit_cast>(zip_lo(bit_cast>(m0), bit_cast>(m1))); matrix_begin[1] = bit_cast>(zip_hi(bit_cast>(m0), bit_cast>(m1))); matrix_begin[2] = bit_cast>(zip_lo(bit_cast>(m2), bit_cast>(m3))); matrix_begin[3] = bit_cast>(zip_hi(bit_cast>(m2), bit_cast>(m3))); auto L4 = zip_hi(bit_cast>(l0), bit_cast>(l1)); auto L5 = zip_hi(bit_cast>(l2), bit_cast>(l3)); auto L6 = zip_hi(bit_cast>(l4), bit_cast>(l5)); auto L7 = zip_hi(bit_cast>(l6), bit_cast>(l7)); auto m4 = zip_lo(bit_cast>(L4), bit_cast>(L5)); auto 
m5 = zip_lo(bit_cast>(L6), bit_cast>(L7)); auto m6 = zip_hi(bit_cast>(L4), bit_cast>(L5)); auto m7 = zip_hi(bit_cast>(L6), bit_cast>(L7)); matrix_begin[4] = bit_cast>(zip_lo(bit_cast>(m4), bit_cast>(m5))); matrix_begin[5] = bit_cast>(zip_hi(bit_cast>(m4), bit_cast>(m5))); matrix_begin[6] = bit_cast>(zip_lo(bit_cast>(m6), bit_cast>(m7))); matrix_begin[7] = bit_cast>(zip_hi(bit_cast>(m6), bit_cast>(m7))); auto H0 = zip_lo(bit_cast>(h0), bit_cast>(h1)); auto H1 = zip_lo(bit_cast>(h2), bit_cast>(h3)); auto H2 = zip_lo(bit_cast>(h4), bit_cast>(h5)); auto H3 = zip_lo(bit_cast>(h6), bit_cast>(h7)); auto M0 = zip_lo(bit_cast>(H0), bit_cast>(H1)); auto M1 = zip_lo(bit_cast>(H2), bit_cast>(H3)); auto M2 = zip_hi(bit_cast>(H0), bit_cast>(H1)); auto M3 = zip_hi(bit_cast>(H2), bit_cast>(H3)); matrix_begin[8] = bit_cast>(zip_lo(bit_cast>(M0), bit_cast>(M1))); matrix_begin[9] = bit_cast>(zip_hi(bit_cast>(M0), bit_cast>(M1))); matrix_begin[10] = bit_cast>(zip_lo(bit_cast>(M2), bit_cast>(M3))); matrix_begin[11] = bit_cast>(zip_hi(bit_cast>(M2), bit_cast>(M3))); auto H4 = zip_hi(bit_cast>(h0), bit_cast>(h1)); auto H5 = zip_hi(bit_cast>(h2), bit_cast>(h3)); auto H6 = zip_hi(bit_cast>(h4), bit_cast>(h5)); auto H7 = zip_hi(bit_cast>(h6), bit_cast>(h7)); auto M4 = zip_lo(bit_cast>(H4), bit_cast>(H5)); auto M5 = zip_lo(bit_cast>(H6), bit_cast>(H7)); auto M6 = zip_hi(bit_cast>(H4), bit_cast>(H5)); auto M7 = zip_hi(bit_cast>(H6), bit_cast>(H7)); matrix_begin[12] = bit_cast>(zip_lo(bit_cast>(M4), bit_cast>(M5))); matrix_begin[13] = bit_cast>(zip_hi(bit_cast>(M4), bit_cast>(M5))); matrix_begin[14] = bit_cast>(zip_lo(bit_cast>(M6), bit_cast>(M7))); matrix_begin[15] = bit_cast>(zip_hi(bit_cast>(M6), bit_cast>(M7))); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_rounding.hpp000066400000000000000000000053011517435117100300140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMMON_ROUNDING_HPP #define XSIMD_COMMON_ROUNDING_HPP #include "./xsimd_common_details.hpp" namespace xsimd { namespace kernel { using namespace types; // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { batch truncated_self = trunc(self); return select(truncated_self < self, truncated_self + 1, truncated_self); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { batch truncated_self = trunc(self); return select(truncated_self > self, truncated_self - 1, truncated_self); } // round template XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept { auto v = abs(self); auto c = ceil(v); auto cp = select(c - 0.5 > v, c - 1, c); return select(v > constants::maxflint>(), self, copysign(cp, self)); } // trunc template ::value>> XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return self; } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_swizzle.hpp000066400000000000000000000321041517435117100276770ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Marco Barbone * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software.* ****************************************************************************/ #ifndef XSIMD_COMMON_SWIZZLE_HPP #define XSIMD_COMMON_SWIZZLE_HPP #include #include #include #include "../../config/xsimd_macros.hpp" namespace xsimd { template struct batch_constant; namespace kernel { namespace detail { // ──────────────────────────────────────────────────────────────────────── // get_at → the I-th element of the pack template struct get_at { static constexpr T value = get_at::value; }; template struct get_at { static constexpr T value = V0; }; // ──────────────────────────────────────────────────────────────────────── // identity_impl template XSIMD_INLINE constexpr bool identity_impl() noexcept { return true; } template XSIMD_INLINE constexpr bool identity_impl() noexcept { return V0 == static_cast(I) && identity_impl(); } // ──────────────────────────────────────────────────────────────────────── // dup_lo_impl template = 0> XSIMD_INLINE constexpr bool dup_lo_impl() noexcept { return true; } template = 0> XSIMD_INLINE constexpr bool dup_lo_impl() noexcept { return get_at::value < static_cast(N / 2) && get_at::value == get_at::value && dup_lo_impl(); } // ──────────────────────────────────────────────────────────────────────── // dup_hi_impl template = 0> XSIMD_INLINE constexpr bool dup_hi_impl() noexcept { return true; } template = 0> XSIMD_INLINE constexpr bool dup_hi_impl() noexcept { return get_at::value >= 
static_cast(N / 2) && get_at::value < static_cast(N) && get_at::value == get_at::value && dup_hi_impl(); } // ──────────────────────────────────────────────────────────────────────── // only_from_lo template struct only_from_lo_impl; template struct only_from_lo_impl { static constexpr bool value = (Last < (Size / 2)); }; template struct only_from_lo_impl { static constexpr bool value = (First < (Size / 2)) && only_from_lo_impl::value; }; template constexpr bool is_only_from_lo() { return only_from_lo_impl::value; }; // ──────────────────────────────────────────────────────────────────────── // only_from_hi template struct only_from_hi_impl; template struct only_from_hi_impl { static constexpr bool value = (Last >= (Size / 2)); }; template struct only_from_hi_impl { static constexpr bool value = (First >= (Size / 2)) && only_from_hi_impl::value; }; template constexpr bool is_only_from_hi() { return only_from_hi_impl::value; }; // ──────────────────────────────────────────────────────────────────────── // 1) helper to get the I-th value from the Vs pack template struct get_nth_value { static constexpr uint32_t value = get_nth_value::value; }; template struct get_nth_value<0, Head, Tail...> { static constexpr uint32_t value = Head; }; // ──────────────────────────────────────────────────────────────────────── // 2) recursive cross‐lane test: true if any output‐lane i pulls from the opposite half template struct cross_impl { // does element I cross? (i.e. i=H) or (i>=H but V::value; static constexpr bool curr = (I < H ? 
(Vi >= H) : (Vi < H)); static constexpr bool next = cross_impl::value; static constexpr bool value = curr || next; }; template struct cross_impl { static constexpr bool value = false; }; template XSIMD_INLINE constexpr bool is_cross_lane() noexcept { static_assert(sizeof...(Vs) >= 1, "Need at least one lane"); return cross_impl<0, sizeof...(Vs), sizeof...(Vs) / 2, Vs...>::value; } /** * @brief Internal: Check if a swizzle pattern crosses lane boundaries * * @tparam LaneSizeBytes Size of a lane in bytes (must be > 0) * @tparam ElemT Element type to determine element size * @tparam U Type of the index values * @tparam Vs... Index values for the swizzle pattern * * @return true if any element accesses data from a different lane * * This is an internal helper. Architecture-specific code can call this directly * with explicit lane sizes (e.g., detail::is_cross_lane_with_lane_size<16, float, ...>() * for 128-bit lanes). */ template XSIMD_INLINE constexpr bool is_cross_lane_with_lane_size() noexcept { static_assert(std::is_integral::value, "swizzle mask values must be integral"); static_assert(sizeof...(Vs) >= 1, "need at least one value"); static_assert(LaneSizeBytes > 0, "lane size must be positive"); constexpr std::size_t lane_elems = LaneSizeBytes / sizeof(ElemT); constexpr U values[] = { Vs... 
}; constexpr std::size_t N = sizeof...(Vs); for (std::size_t i = 0; i < N; ++i) { std::size_t elem_lane = i / lane_elems; std::size_t target_lane = static_cast(values[i]) / lane_elems; if (elem_lane != target_lane) return true; } return false; } template XSIMD_INLINE constexpr bool is_identity() noexcept { return detail::identity_impl<0, T, Vs...>(); } template XSIMD_INLINE constexpr bool is_dup_lo() noexcept { return detail::dup_lo_impl<0, sizeof...(Vs), T, Vs...>(); } template XSIMD_INLINE constexpr bool is_dup_hi() noexcept { return detail::dup_hi_impl<0, sizeof...(Vs), T, Vs...>(); } template XSIMD_INLINE constexpr bool is_identity(batch_constant) noexcept { return is_identity(); } template XSIMD_INLINE constexpr bool is_dup_lo(batch_constant) noexcept { return is_dup_lo(); } template XSIMD_INLINE constexpr bool is_dup_hi(batch_constant) noexcept { return is_dup_hi(); } template XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant) noexcept { return detail::is_only_from_lo(); } template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept { return detail::is_only_from_hi(); } template XSIMD_INLINE constexpr bool is_cross_lane(batch_constant) noexcept { return detail::is_cross_lane_with_lane_size<16, T, T, Vs...>(); } /** * @brief Public: Check if a swizzle pattern crosses 128-bit lane boundaries * * Checks if indices cross 128-bit (16-byte) lane boundaries, which is the * standard lane size for SSE/AVX/AVX512 shuffle operations. * * @tparam ElemT Element type to determine element size * @tparam U Type of the index values * @tparam Vs... 
Index values for the swizzle pattern * * @return true if any element accesses data from a different 128-bit lane * * Examples: * - is_cross_lane() // no crossing (within 128-bit) * - is_cross_lane() // crosses */ template XSIMD_INLINE constexpr bool is_cross_lane() noexcept { return is_cross_lane_with_lane_size<16, ElemT, U, Vs...>(); } // Overload with std::size_t indices template XSIMD_INLINE constexpr bool is_cross_lane() noexcept { return is_cross_lane(); } } // namespace detail } // namespace kernel } // namespace xsimd #endif // XSIMD_COMMON_SWIZZLE_HPP xtensor-stack-xsimd-541558d/include/xsimd/arch/common/xsimd_common_trigo.hpp000066400000000000000000001325331517435117100273230ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMMON_TRIGO_HPP #define XSIMD_COMMON_TRIGO_HPP #include "./xsimd_common_details.hpp" #include namespace xsimd { namespace kernel { /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ using namespace types; // acos template XSIMD_INLINE batch acos(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto x_larger_05 = x > batch_type(0.5); x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); x = asin(x); x = select(x_larger_05, x + x, x); x = select(self < batch_type(-0.5), constants::pi() - x, x); return select(x_larger_05, x, constants::pio2() - x); } template XSIMD_INLINE batch, A> acos(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type tmp = asin(z); return { constants::pio2() - tmp.real(), -tmp.imag() }; } // acosh /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch acosh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = self - batch_type(1.); auto test = x > constants::oneotwoeps(); batch_type z = select(test, self, x + sqrt(x + x + x * x)); batch_type l1pz = log1p(z); return select(test, l1pz + constants::log_2(), l1pz); } template XSIMD_INLINE batch, A> acosh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = acos(z); w = batch_type(-w.imag(), w.real()); return w; } // asin template XSIMD_INLINE batch asin(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type sign = bitofsign(self); auto x_larger_05 = x > batch_type(0.5); batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) 
- x), x * x); x = select(x_larger_05, sqrt(z), x); batch_type z1 = detail::horner(z); z1 = fma(z1, z * x, x); z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); return z ^ sign; } template XSIMD_INLINE batch asin(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto small_cond = x < constants::sqrteps(); batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); batch_type zz1 = batch_type(1.) - x; batch_type vp = zz1 * detail::horner(zz1) / detail::horner1(zz1); zz1 = sqrt(zz1 + zz1); batch_type z = constants::pio4() - zz1; zz1 = fms(zz1, vp, constants::pio_2lo()); z = z - zz1; zz1 = z + constants::pio4(); batch_type zz2 = self * self; z = zz2 * detail::horner(zz2) / detail::horner1(zz2); zz2 = fma(x, z, x); return select(x > batch_type(1.), constants::nan(), select(small_cond, x, select(x > ct1, zz1, zz2)) ^ bitofsign(self)); } template XSIMD_INLINE batch, A> asin(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch x = z.real(); real_batch y = z.imag(); batch_type ct(-y, x); batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y); zz = log(ct + sqrt(zz)); batch_type resg(zz.imag(), -zz.real()); return select(y == real_batch(0.), select(fabs(x) > real_batch(1.), batch_type(constants::pio2(), real_batch(0.)), batch_type(asin(x), real_batch(0.))), resg); } // asinh /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template ::value>> XSIMD_INLINE batch average(const batch& x1, const batch& x2) noexcept { return (x1 & x2) + ((x1 ^ x2) >> 1); } template XSIMD_INLINE batch averagef(const batch& x1, const batch& x2) noexcept { using batch_type = batch; return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); } template XSIMD_INLINE batch average(batch const& x1, batch const& x2) noexcept { return averagef(x1, x2); } template XSIMD_INLINE batch average(batch const& x1, batch const& x2) noexcept { return averagef(x1, x2); } } template XSIMD_INLINE batch asinh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto lthalf = x < batch_type(0.5); batch_type x2 = x * x; batch_type bts = bitofsign(self); batch_type z(0.); if (any(lthalf)) { z = detail::horner(x2) * x; if (all(lthalf)) return z ^ bts; } batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); #else return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; #endif } template XSIMD_INLINE batch asinh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test = x > constants::oneosqrteps(); batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) 
+ hypot(batch_type(1.), x))); #ifndef XSIMD_NO_INFINITIES z = select(x == constants::infinity(), x, z); #endif batch_type l1pz = log1p(z); z = select(test, l1pz + constants::log_2(), l1pz); return bitofsign(self) ^ z; } template XSIMD_INLINE batch, A> asinh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = asin(batch_type(-z.imag(), z.real())); w = batch_type(w.imag(), -w.real()); return w; } // atan namespace detail { template static XSIMD_INLINE batch kernel_atan(const batch& x, const batch& recx) noexcept { using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); yy = select(flag2, constants::pio4(), yy); batch_type xx = select(flag1, x, -recx); xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); const batch_type z = xx * xx; batch_type z1 = detail::horner(z); z1 = fma(xx, z1 * z, xx); z1 = select(flag2, z1 + constants::pio_4lo(), z1); z1 = select(!flag1, z1 + constants::pio_2lo(), z1); return yy + z1; } template static XSIMD_INLINE batch kernel_atan(const batch& x, const batch& recx) noexcept { using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= constants::tanpio8()) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); yy = select(flag2, constants::pio4(), yy); batch_type xx = select(flag1, x, -recx); xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); batch_type z = xx * xx; z *= detail::horner(z) / detail::horner1(z); z = fma(xx, z, xx); z = select(flag2, z + constants::pio_4lo(), z); z = z + select(flag1, batch_type(0.), constants::pio_2lo()); return yy + z; } } template XSIMD_INLINE batch atan(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type absa = abs(self); const batch_type x = detail::kernel_atan(absa, batch_type(1.) 
/ absa); return x ^ bitofsign(self); } template XSIMD_INLINE batch, A> atan(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch x = z.real(); real_batch y = z.imag(); real_batch x2 = x * x; real_batch one(1.); real_batch a = one - x2 - (y * y); real_batch w = 0.5 * atan2(2. * x, a); real_batch num = y + one; num = x2 + num * num; real_batch den = y - one; den = x2 + den * den; #ifdef __FAST_MATH__ return batch_type(w, 0.25 * log(num / den)); #else return select((x == real_batch(0.)) && (y == real_batch(1.)), batch_type(real_batch(0.), constants::infinity()), batch_type(w, 0.25 * log(num / den))); #endif } // atanh /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch atanh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type t = x + x; batch_type z = batch_type(1.) - x; auto test = x < batch_type(0.5); batch_type tmp = select(test, x, t) / z; return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); } template XSIMD_INLINE batch, A> atanh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = atan(batch_type(-z.imag(), z.real())); w = batch_type(w.imag(), -w.real()); return w; } // atan2 template XSIMD_INLINE batch atan2(batch const& self, batch const& other, requires_arch) noexcept { using batch_type = batch; #ifdef __FAST_MATH__ const batch_type q = abs(self / other); const batch_type q_p = abs(other / self); #else const batch_type q = abs(self / other); const batch_type q_p = 1. 
/ q; #endif const batch_type z = detail::kernel_atan(q, q_p); return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); } // cos namespace detail { template XSIMD_INLINE batch quadrant(const batch& x) noexcept { return x & batch(3); } template XSIMD_INLINE batch quadrant(const batch& x) noexcept { return to_float(quadrant(to_int(x))); } template XSIMD_INLINE batch quadrant(const batch& x) noexcept { using batch_type = batch; batch_type a = x * batch_type(0.25); return (a - floor(a)) * batch_type(4.); } /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch cos_eval(const batch& z) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); } template XSIMD_INLINE batch sin_eval(const batch& z, const batch& x) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return fma(y * z, x, x); } template static XSIMD_INLINE batch base_tancot_eval(const batch& z) noexcept { using batch_type = batch; batch_type zz = z * z; batch_type y = detail::horner(zz); return fma(y, zz * z, z); } template static XSIMD_INLINE batch tan_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template static XSIMD_INLINE batch cot_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) 
/ y, -y); } /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static XSIMD_INLINE batch cos_eval(const batch& z) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return batch_type(1.) - y * z; } template static XSIMD_INLINE batch sin_eval(const batch& z, const batch& x) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return fma(y * z, x, x); } template static XSIMD_INLINE batch base_tancot_eval(const batch& z) noexcept { using batch_type = batch; batch_type zz = z * z; batch_type num = detail::horner(zz); batch_type den = detail::horner1(zz); return fma(z, (zz * (num / den)), z); } template static XSIMD_INLINE batch tan_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template static XSIMD_INLINE batch cot_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ struct trigo_radian_tag { }; struct trigo_pi_tag { }; template struct trigo_reducer { static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept { if (all(x <= constants::pio4())) { xr = x; return B(0.); } else if (all(x <= constants::pio2())) { auto test = x > constants::pio4(); xr = x - constants::pio2_1(); detail::reassociation_barrier(xr, "ordered pio2 subtraction"); xr -= constants::pio2_2(); detail::reassociation_barrier(xr, "ordered pio2 subtraction"); xr -= constants::pio2_3(); detail::reassociation_barrier(xr, "ordered pio2 subtraction"); xr = select(test, xr, x); return select(test, B(1.), B(0.)); } else if (all(x <= constants::twentypi())) { B xi = nearbyint(x * constants::twoopi()); detail::reassociation_barrier(xi, "preserve quadrant selection"); xr = fnma(xi, constants::pio2_1(), x); detail::reassociation_barrier(xr, "compensated range reduction"); xr -= xi * constants::pio2_2(); detail::reassociation_barrier(xr, "compensated range reduction"); xr -= xi * constants::pio2_3(); detail::reassociation_barrier(xr, "compensated range reduction"); return quadrant(xi); } else if (all(x <= constants::mediumpi())) { B fn = nearbyint(x * constants::twoopi()); detail::reassociation_barrier(fn, "multi-term range reduction"); B r = x - fn * constants::pio2_1(); detail::reassociation_barrier(r, "multi-term range reduction"); B w = fn * constants::pio2_1t(); B t = r; w = fn * constants::pio2_2(); r = t - w; detail::reassociation_barrier(r, "multi-term range reduction"); w = fn * constants::pio2_2t() - ((t - r) - w); t = r; w = fn * constants::pio2_3(); r = t - w; detail::reassociation_barrier(r, "multi-term range reduction"); w = fn * constants::pio2_3t() - ((t - r) - w); xr = r - w; detail::reassociation_barrier(xr, "multi-term range reduction"); return quadrant(fn); } else { static constexpr std::size_t size = B::size; using value_type = typename B::value_type; 
alignas(B) std::array tmp; alignas(B) std::array txr; alignas(B) std::array args; x.store_aligned(args.data()); for (std::size_t i = 0; i < size; ++i) { double arg = args[i]; #ifndef __FAST_MATH__ if (arg == std::numeric_limits::infinity()) { tmp[i] = 0.; txr[i] = std::numeric_limits::quiet_NaN(); } else #endif { double y[2]; std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); tmp[i] = value_type(n & 3); txr[i] = value_type(y[0]); } } xr = B::load_aligned(&txr[0]); B res = B::load_aligned(&tmp[0]); return res; } } }; template struct trigo_reducer { static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept { B xi = nearbyint(x * B(2.)); B x2 = x - xi * B(0.5); xr = x2 * constants::pi(); return quadrant(xi); } }; } template XSIMD_INLINE batch cos(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); return z1 ^ sign_bit; } template XSIMD_INLINE batch, A> cos(batch, A> const& z, requires_arch) noexcept { return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; } // cosh /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch cosh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test1 = x > (constants::maxlog() - constants::log_2()); batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); batch_type tmp = exp(x * fac); batch_type tmp1 = batch_type(0.5) * tmp; return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); } template XSIMD_INLINE batch, A> cosh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); return { cosh(x) * cos(y), sinh(x) * sin(y) }; } // sin namespace detail { template XSIMD_INLINE batch sin(batch const& self, Tag = Tag()) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); return z1 ^ sign_bit; } } template XSIMD_INLINE batch sin(batch const& self, requires_arch) noexcept { return detail::sin(self); } template XSIMD_INLINE batch, A> sin(batch, A> const& z, requires_arch) noexcept { return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; } // sincos template XSIMD_INLINE std::pair, batch> sincos(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), 
tmp, n); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); } template XSIMD_INLINE std::pair, A>, batch, A>> sincos(batch, A> const& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch rcos = cos(z.real()); real_batch rsin = sin(z.real()); real_batch icosh = cosh(z.imag()); real_batch isinh = sinh(z.imag()); return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); } // sinh namespace detail { /* origin: boost/simd/arch/common/detail/common/sinh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch sinh_kernel(batch const& self) noexcept { using batch_type = batch; batch_type sqr_self = self * self; return detail::horner(sqr_self) * self; } template XSIMD_INLINE batch sinh_kernel(batch const& self) noexcept { using batch_type = batch; batch_type sqrself = self * self; return fma(self, (detail::horner(sqrself) / detail::horner1(sqrself)) * sqrself, self); } } /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch sinh(batch const& a, requires_arch) noexcept { using batch_type = batch; batch_type half(0.5); batch_type x = abs(a); auto lt1 = x < batch_type(1.); batch_type bts = bitofsign(a); batch_type z(0.); if (any(lt1)) { z = detail::sinh_kernel(x); if (all(lt1)) return z ^ bts; } auto test1 = x > (constants::maxlog() - constants::log_2()); batch_type fac = select(test1, half, batch_type(1.)); batch_type tmp = exp(x * fac); batch_type tmp1 = half * tmp; batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); return select(lt1, z, r) ^ bts; } template XSIMD_INLINE batch, A> sinh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); return { sinh(x) * cos(y), cosh(x) * sin(y) }; } // tan template XSIMD_INLINE batch tan(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto test = (swap_bit == batch_type(0.)); const batch_type y = detail::tan_eval(xr, test); return y ^ bitofsign(self); } template XSIMD_INLINE batch, A> tan(batch, A> const& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); real_batch wreal = sin(2 * z.real()) / d; real_batch wimag = sinh(2 * z.imag()); #ifdef __FAST_MATH__ return batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d); #else batch_type winf(constants::infinity(), constants::infinity()); batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); return select(d == real_batch(0.), winf, wres); #endif } // tanh namespace detail { /* origin: 
boost/simd/arch/common/detail/common/tanh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tanh_kernel; template struct tanh_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept { batch_type sqrx = x * x; return fma(detail::horner(sqrx) * sqrx, x, x); } static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept { return batch_type(1.) / tanh(x); } }; template struct tanh_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept { batch_type sqrx = x * x; return fma(sqrx * p(sqrx) / q(sqrx), x, x); } static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept { batch_type sqrx = x * x; batch_type qval = q(sqrx); return qval / (x * fma(p(sqrx), sqrx, qval)); } static XSIMD_INLINE batch_type p(const batch_type& x) noexcept { return detail::horner(x); } static XSIMD_INLINE batch_type q(const batch_type& x) noexcept { return detail::horner1(x); } }; } /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch tanh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type one(1.); batch_type x = abs(self); auto test = x < (batch_type(5.) 
/ batch_type(8.)); batch_type bts = bitofsign(self); batch_type z = one; if (any(test)) { z = detail::tanh_kernel::tanh(x); if (all(test)) return z ^ bts; } batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); return select(test, z, r) ^ bts; } template XSIMD_INLINE batch, A> tanh(const batch, A>& z, requires_arch) noexcept { using real_batch = typename batch, A>::real_batch; auto x = z.real(); auto y = z.imag(); real_batch two(2); auto d = cosh(two * x) + cos(two * y); return { sinh(two * x) / d, sin(two * y) / d }; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/utils/000077500000000000000000000000001517435117100225535ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/arch/utils/shifts.hpp000066400000000000000000000067431517435117100245760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Marco Barbone * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_UTILS_SHIFTS_HPP #define XSIMD_UTILS_SHIFTS_HPP #include "../../config/xsimd_macros.hpp" #include "../../types/xsimd_batch.hpp" #include "../../types/xsimd_batch_constant.hpp" #include "../../types/xsimd_traits.hpp" namespace xsimd { namespace kernel { namespace utils { template struct select_stride { template static constexpr K get(K i, K) { constexpr I values_array[] = { Vs... 
}; return static_cast(values_array[length * i + offset]); } }; template constexpr I lsb_mask(I bit_index) { if (bit_index == 8 * sizeof(I)) { return ~I { 0 }; } return static_cast((I { 1 } << bit_index) - I { 1 }); } template constexpr bool all_equals(batch_constant c) { return (c == std::integral_constant {}).all(); } template XSIMD_INLINE batch bitwise_lshift_as_twice_larger( batch const& self, batch_constant) noexcept { using T2 = widen_t; const auto self2 = bitwise_cast(self); // Lower byte: shift as twice the size and mask bits flowing to higher byte. constexpr auto shifts_lo = make_batch_constant, A>(); constexpr auto mask_lo = lsb_mask(8 * sizeof(T)); const auto shifted_lo = bitwise_lshift(self2, shifts_lo); constexpr auto batch_mask_lo = make_batch_constant(); const auto masked_lo = bitwise_and(shifted_lo, batch_mask_lo.as_batch()); // Higher byte: mask bits that would flow from lower byte and shift as twice the size. constexpr auto shifts_hi = make_batch_constant, A>(); constexpr auto mask_hi = mask_lo << (8 * sizeof(T)); constexpr auto batch_mask_hi = make_batch_constant(); const auto masked_hi = bitwise_and(self2, batch_mask_hi.as_batch()); const auto shifted_hi = bitwise_lshift(masked_hi, shifts_hi); return bitwise_cast(bitwise_or(masked_lo, shifted_hi)); } } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx.hpp000066400000000000000000003005041517435117100241300ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Marco Barbone * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX_HPP #define XSIMD_AVX_HPP #include #include #include #include "../types/xsimd_avx_register.hpp" #include "../types/xsimd_batch_constant.hpp" namespace xsimd { namespace kernel { using namespace types; // fwd template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; namespace detail { XSIMD_INLINE __m128i lower_half(__m256i self) noexcept { return _mm256_castsi256_si128(self); } XSIMD_INLINE __m128 lower_half(__m256 self) noexcept { return _mm256_castps256_ps128(self); } XSIMD_INLINE __m128d lower_half(__m256d self) noexcept { return _mm256_castpd256_pd128(self); } XSIMD_INLINE __m128i upper_half(__m256i self) noexcept { return _mm256_extractf128_si256(self, 1); } XSIMD_INLINE __m128 upper_half(__m256 self) noexcept { return _mm256_extractf128_ps(self, 1); } XSIMD_INLINE __m128d upper_half(__m256d self) noexcept { return _mm256_extractf128_pd(self, 1); } XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept { return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); } XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept { return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); } XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept { return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1); } template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept { __m128i self_low = lower_half(self), self_high = upper_half(self); __m128i res_low = f(self_low); __m128i res_high = f(self_high); return merge_sse(res_low, res_high); } template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept { __m128i self_low = lower_half(self), self_high = 
upper_half(self), other_low = lower_half(other), other_high = upper_half(other); __m128i res_low = f(self_low, other_low); __m128i res_high = f(self_high, other_high); return merge_sse(res_low, res_high); } template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept { __m128i self_low = lower_half(self), self_high = upper_half(self); __m128i res_low = f(self_low, other); __m128i res_high = f(self_high, other); return merge_sse(res_low, res_high); } } // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 return _mm256_andnot_ps(sign_mask, self); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 return _mm256_andnot_pd(sign_mask, self); } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return add(batch(s), batch(o)); }, self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_add_ps(self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_add_pd(self, other); } // all template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_ps(self, batch_bool(true)) != 0; } template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_pd(self, batch_bool(true)) != 0; } template ::value>> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_si256(self, batch_bool(true)) != 0; } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_ps(self, self); } template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return 
!_mm256_testz_pd(self, self); } template ::value>> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_si256(self, self); } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_ps(self, other); } template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_pd(self, other); } template ::value>> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o)); }, self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o)); }, self, other); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_ps(other, self); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_pd(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_ps(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& 
self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_pd(other, self); } template ::value>> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_andnot(batch(s), batch(o)); }, self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_andnot(batch(s), batch(o)); }, self, other); } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept { return bitwise_lshift(batch(s), o, sse4_2 {}); }, self, other); } // bitwise_not template ::value>> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s) noexcept { return bitwise_not(batch(s), sse4_2 {}); }, self); } template ::value>> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s) noexcept { return bitwise_not(batch_bool(s), sse4_2 {}); }, self); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_ps(self, other); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_pd(self, other); } template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept 
{ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_or(batch(s), batch(o)); }, self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_or(batch_bool(s), batch_bool(o)); }, self, other); } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, sse4_2 {}); }, self, other); } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template ::value>> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_xor(batch(s), batch(o), sse4_2 {}); }, self, other); } template ::value>> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2 {}); }, self, other); } // bitwise_cast template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castsi256_ps(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, 
batch const&, requires_arch) noexcept { return _mm256_castsi256_pd(self); } template >::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castps_pd(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castps_si256(self); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castpd_ps(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castpd_si256(self); } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } // broadcast template ::value>> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_set1_epi64x(val); } else { assert(false && "unsupported"); return {}; } } template XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept { return _mm256_set1_ps(val); } template 
XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return _mm256_set1_pd(val); } // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm256_ceil_ps(self); } template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm256_ceil_pd(self); } namespace detail { // On clang, _mm256_extractf128_ps is built upon build_shufflevector // which require index parameter to be a constant template XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept { __m128 tmp0 = _mm256_extractf128_ps(real, index); __m128 tmp1 = _mm256_extractf128_ps(imag, index); __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); tmp0 = _mm_unpacklo_ps(tmp0, tmp1); __m256 res = real; res = _mm256_insertf128_ps(res, tmp0, 0); res = _mm256_insertf128_ps(res, tmp2, 1); return res; } template XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept { __m128d tmp0 = _mm256_extractf128_pd(real, index); __m128d tmp1 = _mm256_extractf128_pd(imag, index); __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); tmp0 = _mm_unpacklo_pd(tmp0, tmp1); __m256d res = real; res = _mm256_insertf128_pd(res, tmp0, 0); res = _mm256_insertf128_pd(res, tmp2, 1); return res; } // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return get_half_complex_f<0>(self.real(), self.imag()); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return get_half_complex_d<0>(self.real(), self.imag()); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return get_half_complex_f<1>(self.real(), self.imag()); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return get_half_complex_d<1>(self.real(), self.imag()); } } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return 
_mm256_cvtepi32_ps(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_cvttps_epi32(self); } } // decr_if template ::value>> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch(mask.data); } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_div_ps(self, other); } template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_div_pd(self, other); } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); } template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return eq(batch(s), batch(o), sse4_2 {}); }, self, other); } template ::value>> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm256_floor_ps(self); } template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm256_floor_pd(self); } // from_mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut32[] = { 0x0000000000000000ul, 
0x00000000FFFFFFFFul, 0xFFFFFFFF00000000ul, 0xFFFFFFFFFFFFFFFFul, }; assert(!(mask & ~0xFFul) && "inbound mask"); return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6])); } template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; assert(!(mask & ~0xFul) && "inbound mask"); return _mm256_castsi256_pd(_mm256_load_si256((const 
__m256i*)lut64[mask])); } template ::value>> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { alignas(A::alignment()) static const uint32_t lut32[] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, }; assert(!(mask & ~0xFFFFFFFFul) && "inbound mask"); return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF], lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF], lut32[(mask >> 24) & 0xF], lut32[mask >> 28]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000ul, 0x000000000000FFFFul, 0x00000000FFFF0000ul, 0x00000000FFFFFFFFul, 0x0000FFFF00000000ul, 0x0000FFFF0000FFFFul, 0x0000FFFFFFFF0000ul, 0x0000FFFFFFFFFFFFul, 0xFFFF000000000000ul, 0xFFFF00000000FFFFul, 0xFFFF0000FFFF0000ul, 0xFFFF0000FFFFFFFFul, 0xFFFFFFFF00000000ul, 0xFFFFFFFF0000FFFFul, 0xFFFFFFFFFFFF0000ul, 0xFFFFFFFFFFFFFFFFul, }; assert(!(mask & ~0xFFFFul) && "inbound mask"); return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_castps_si256(from_mask(batch_bool {}, mask, avx {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_castpd_si256(from_mask(batch_bool {}, mask, avx {})); } } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // row = (a,b,c,d,e,f,g,h) // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // a4+a5+a6+a7, b4+b5+b6+b7, 
c4+c5+c6+c7, d4+d5+d6+d7) tmp1 = _mm256_hadd_ps(tmp0, tmp1); // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) tmp0 = _mm256_hadd_ps(row[4], row[5]); // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp2 = _mm256_hadd_ps(tmp0, tmp2); // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); return _mm256_add_ps(tmp0, tmp1); } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // row = (a,b,c,d) // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); // tmp1 = (a2+a3, b2+b3, c0+c1, d0+d1): high lane of tmp0 followed by low lane of tmp1 tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21); return _mm256_add_pd(tmp1, tmp2); } // incr_if template ::value>> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch(mask.data); } // first (must precede get for two-phase lookup) template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return _mm256_cvtss_f32(self); } template XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return _mm256_cvtsd_f64(self); } template ::value>> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return static_cast(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return 
static_cast(_mm_cvtsi128_si32(_mm256_castsi256_si128(self)) & 0xFFFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return static_cast(_mm_cvtsi128_si32(_mm256_castsi256_si128(self))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { batch low = _mm256_castsi256_si128(self); return first(low, sse4_2 {}); } else { assert(false && "unsupported arch/op combination"); return {}; } } // get template XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); } constexpr size_t elements_per_lane = batch::size; constexpr size_t lane = I / elements_per_lane; constexpr size_t sub_index = I % elements_per_lane; const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self); return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); } template XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); } constexpr size_t elements_per_lane = batch::size; constexpr size_t lane = I / elements_per_lane; constexpr size_t sub_index = I % elements_per_lane; const auto half = (lane == 0) ? detail::lower_half(self) : detail::upper_half(self); return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); } template ::value>> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx {}); } constexpr size_t elements_per_lane = batch::size; constexpr size_t lane = I / elements_per_lane; constexpr size_t sub_index = I % elements_per_lane; const auto half = (lane == 0) ? 
detail::lower_half(self) : detail::upper_half(self); return kernel::get(batch(half), ::xsimd::index {}, sse4_1 {}); } // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { #if !defined(_MSC_VER) || _MSC_VER > 1900 XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_insert_epi8(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_insert_epi16(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_insert_epi32(self, val, I); } else { return insert(self, val, pos, common {}); } #endif return insert(self, val, pos, common {}); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); } // le template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_LE_OQ); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_LE_OQ); } // load_aligned template ::value>> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return _mm256_load_si256((__m256i const*)mem); } template XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm256_load_ps(mem); } template XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm256_load_pd(mem); } namespace detail { // load_complex template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; __m128 tmp0 = _mm256_extractf128_ps(hi, 0); __m128 tmp1 = _mm256_extractf128_ps(hi, 1); __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, 
_MM_SHUFFLE(3, 1, 3, 1)); batch_type real = _mm256_castps128_ps256(tmp_real); batch_type imag = _mm256_castps128_ps256(tmp_imag); tmp0 = _mm256_extractf128_ps(lo, 0); tmp1 = _mm256_extractf128_ps(lo, 1); tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); real = _mm256_insertf128_ps(real, tmp_real, 1); imag = _mm256_insertf128_ps(imag, tmp_imag, 1); return { real, imag }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; __m128d tmp0 = _mm256_extractf128_pd(hi, 0); __m128d tmp1 = _mm256_extractf128_pd(hi, 1); batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1)); batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1)); tmp0 = _mm256_extractf128_pd(lo, 0); tmp1 = _mm256_extractf128_pd(lo, 1); __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); real = _mm256_blend_pd(real, re_tmp1, 12); imag = _mm256_blend_pd(imag, im_tmp1, 12); return { real, imag }; } } // load_unaligned template ::value>> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm256_loadu_si256((__m256i const*)mem); } template XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return _mm256_loadu_ps(mem); } template XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm256_loadu_pd(mem); } // AVX helpers to avoid type-based branching in the generic load_masked namespace detail { template XSIMD_INLINE batch maskload(float const* mem, batch, A> const& mask) noexcept { return _mm256_maskload_ps(mem, mask); } template XSIMD_INLINE batch maskload(double const* mem, batch, A> const& mask) noexcept { return _mm256_maskload_pd(mem, mask); } template XSIMD_INLINE batch zero_extend(batch const& hi) noexcept 
{ return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1); } template XSIMD_INLINE batch zero_extend(batch const& hi) noexcept { return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1); } // allow inserting a 128-bit SSE batch into the upper half of an AVX batch template XSIMD_INLINE batch zero_extend(batch const& hi) noexcept { return _mm256_insertf128_ps(_mm256_setzero_ps(), hi, 1); } template XSIMD_INLINE batch zero_extend(batch const& hi) noexcept { return _mm256_insertf128_pd(_mm256_setzero_pd(), hi, 1); } } // load_masked (single overload for float/double) template ::value>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { using int_t = as_integer_t; constexpr size_t half_size = batch::size / 2; // confined to lower 128-bit half → forward to SSE2 XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { constexpr auto mlo = ::xsimd::detail::lower_half(batch_bool_constant {}); const auto lo = load_masked(reinterpret_cast(mem), mlo, convert {}, Mode {}, sse4_2 {}); return bitwise_cast(batch(_mm256_zextsi128_si256(lo))); } // confined to upper 128-bit half → forward to SSE2 else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = load_masked(mem + half_size, mhi, convert {}, Mode {}, sse4_2 {}); return detail::zero_extend(hi); } else { // crossing 128-bit boundary → use 256-bit masked load return detail::maskload(mem, mask.as_batch()); } } // store_masked namespace detail { template XSIMD_INLINE void maskstore(float* mem, batch_bool const& mask, batch const& src) noexcept { _mm256_maskstore_ps(mem, mask, src); } template XSIMD_INLINE void maskstore(double* mem, batch_bool const& mask, batch const& src) noexcept { _mm256_maskstore_pd(mem, mask, src); } } template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t half_size = batch::size / 2; // confined 
to lower 128-bit half → forward to SSE2 XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half_size) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } // confined to upper 128-bit half → forward to SSE2 else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half_size) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = detail::upper_half(src); store_masked(mem + half_size, hi, mhi, Mode {}, sse4_2 {}); } else { detail::maskstore(mem, mask.as_batch(), src); } } // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_LT_OQ); } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_LT_OQ); } template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return lt(batch(s), batch(o)); }, self, other); } // mask template ::value>> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { __m128i self_low = detail::lower_half(self), self_high = detail::upper_half(self); return mask(batch_bool(self_low), sse4_2 {}) | (mask(batch_bool(self_high), sse4_2 {}) << (128 / (8 * sizeof(T)))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_movemask_ps(_mm256_castsi256_ps(self)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_movemask_pd(_mm256_castsi256_pd(self)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm256_movemask_ps(self); } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm256_movemask_pd(self); } // max template XSIMD_INLINE 
batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_max_ps(other, self); } template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_max_pd(other, self); } template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_min_ps(other, self); } template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_min_pd(other, self); } template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_mul_ps(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_mul_pd(self, other); } // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm256_cvtps_epi32(self); } // neg template ::value>> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } /* single-precision negation: XOR each lane with 0x80000000; declared XSIMD_INLINE and noexcept like the sibling neg overloads */ template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); } // neq template XSIMD_INLINE batch_bool 
neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ); } template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ); } template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template ::value>> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data))); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm256_rcp_ps(self); } // reduce_add template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { typename batch::register_type low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_add(blow + bhigh); } // reduce_max template > XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr auto mask = detail::shuffle(1, 0); batch step = _mm256_permute2f128_si256(self, self, mask); batch acc = max(self, step); __m128i low = _mm256_castsi256_si128(acc); return reduce_max(batch(low)); } // reduce_min template > XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr auto mask = detail::shuffle(1, 0); batch step = _mm256_permute2f128_si256(self, self, mask); batch acc = min(self, step); __m128i low = _mm256_castsi256_si128(acc); return reduce_min(batch(low)); } // reduce_mul template ::value>> XSIMD_INLINE T 
reduce_mul(batch const& self, requires_arch) noexcept { typename batch::register_type low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_mul(blow * bhigh); } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm256_rsqrt_ps(val); } template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val))); } // sadd template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = (other >> (8 * sizeof(T) - 1)); auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm256_blendv_ps(false_br, true_br, cond); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm256_blendv_pd(false_br, true_br, cond); } template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { __m128i cond_low = detail::lower_half(cond), cond_hi = detail::upper_half(cond); __m128i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br); __m128i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br); __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); return 
detail::merge_sse(res_low, res_hi); } /* compile-time integral mask: materialise it as a runtime batch_bool and reuse the AVX integer select above; the tag is avx (not avx2) because this is the AVX kernel */ template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, avx {}); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr auto mask = batch_bool_constant::mask(); return _mm256_blend_ps(false_br, true_br, mask); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr auto mask = batch_bool_constant::mask(); return _mm256_blend_pd(false_br, true_br, mask); } // set template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm256_setr_ps(values...); } template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm256_setr_pd(values...); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return _mm256_set_epi64x(v3, v2, v1, v0); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, 
T v28, T v29, T v30, T v31) noexcept { return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); } template ::value>> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm256_castsi256_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm256_castsi256_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12) return _mm256_shuffle_ps(x, y, smask); // shuffle within opposite lane if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12) return _mm256_shuffle_ps(y, x, smask); return shuffle(x, y, mask, common {}); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3); // shuffle within lane if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6) return _mm256_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 
>= 6) return _mm256_shuffle_pd(y, x, smask); return shuffle(x, y, mask, common {}); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; __m128i low = _mm256_castsi256_si128(x); auto y = _mm_slli_si128(low, M); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, y, 1); } if (BitCount == 128) { __m128i low = _mm256_castsi256_si128(x); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, low, 1); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; __m128i low = _mm256_castsi256_si128(x); auto ylow = _mm_slli_si128(low, M); auto zlow = _mm_srli_si128(low, 16 - M); __m128i high = _mm256_extractf128_si256(x, 1); auto yhigh = _mm_slli_si128(high, M); __m256i res = _mm256_castsi128_si256(ylow); return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; __m128i high = _mm256_extractf128_si256(x, 1); __m128i y = _mm_srli_si128(high, M); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, y, 0); } if (BitCount == 128) { __m128i high = _mm256_extractf128_si256(x, 1); return _mm256_castsi128_si256(high); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; __m128i low = _mm256_castsi256_si128(x); auto ylow = _mm_srli_si128(low, M); __m128i high = _mm256_extractf128_si256(x, 1); auto yhigh = _mm_srli_si128(high, M); auto zhigh = _mm_slli_si128(high, 16 - M); __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh)); return _mm256_insertf128_si256(res, yhigh, 1); } // 
sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm256_sqrt_ps(val); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm256_sqrt_pd(val); } // ssub template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } // store_aligned template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm256_store_si256((__m256i*)mem, self); } template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm256_store_si256((__m256i*)mem, self); } template XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm256_store_ps(mem, self); } template XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm256_store_pd(mem, self); } // store_unaligned template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_si256((__m256i*)mem, self); } template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm256_storeu_si256((__m256i*)mem, self); } template XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_ps(mem, self); } template XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_pd(mem, self); } // store_stream template XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept { _mm256_stream_ps(mem, self); } template XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept { _mm256_stream_pd(mem, self); } template ::value, void>> XSIMD_INLINE void 
store_stream(T* mem, batch const& self, requires_arch) noexcept { _mm256_stream_si256((__m256i*)mem, self); } // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_sub_ps(self, other); } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_sub_pd(self, other); } // swizzle (dynamic mask) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // swap lanes __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low] // normalize mask taking modulo 4 batch half_mask = mask & 0b11u; // permute within each lane __m256 r0 = _mm256_permutevar_ps(self, half_mask); __m256 r1 = _mm256_permutevar_ps(swapped, half_mask); // select lane by the mask index divided by 4 constexpr auto lane = batch_constant {}; batch_bool blend_mask = (mask & 0b100u) != lane; return _mm256_blendv_ps(r0, r1, batch_bool_cast(blend_mask)); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // swap lanes __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low] // The half mask value is found in mask modulo 2, but the intrinsic expect it in the // second least significant bit. We use negative as a cheap alternative to lshift. 
batch half_mask = -(mask & 0b1u); // permute within each lane __m256d r0 = _mm256_permutevar_pd(self, half_mask); __m256d r1 = _mm256_permutevar_pd(swapped, half_mask); // select lane by the mask index divided by 2 constexpr auto lane = batch_constant {}; batch_bool blend_mask = (mask & 0b10u) != lane; return _mm256_blendv_pd(r0, r1, batch_bool_cast(blend_mask)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } // swizzle (constant mask) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { constexpr bool is_identity = detail::is_identity(mask); constexpr bool is_dup_low = detail::is_dup_lo(mask); constexpr bool is_dup_hi = detail::is_dup_hi(mask); constexpr bool is_dup = is_dup_low || is_dup_hi; XSIMD_IF_CONSTEXPR(is_identity) { return self; } XSIMD_IF_CONSTEXPR(is_dup) { constexpr auto control = is_dup_low ? 0x00 : 0x11; constexpr auto is_dup_identity = is_dup_low ? detail::is_identity() : detail::is_identity(); auto split = _mm256_permute2f128_ps(self, self, control); XSIMD_IF_CONSTEXPR(!is_dup_identity) { constexpr auto shuffle_mask = is_dup_low ? 
detail::mod_shuffle(V0, V1, V2, V3) : detail::mod_shuffle(V4 - 4, V5 - 4, V6 - 4, V7 - 4); split = _mm256_permute_ps(split, shuffle_mask); } return split; } constexpr auto lane_mask = mask % std::integral_constant(); XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask)) { __m256 broadcast = _mm256_permute2f128_ps(self, self, 0x00); // [low | low] return _mm256_permutevar_ps(broadcast, lane_mask.as_batch()); } XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask)) { __m256 broadcast = _mm256_permute2f128_ps(self, self, 0x11); // [high | high] return _mm256_permutevar_ps(broadcast, lane_mask.as_batch()); } // Fallback to general algorithm. This is the same as the dynamic version with the exception // that possible operations are done at compile time. // swap lanes __m256 swapped = _mm256_permute2f128_ps(self, self, 0x01); // [high | low] // normalize mask taking modulo 4 constexpr auto half_mask = mask % std::integral_constant(); // permute within each lane __m256 r0 = _mm256_permutevar_ps(self, half_mask.as_batch()); __m256 r1 = _mm256_permutevar_ps(swapped, half_mask.as_batch()); // select lane by the mask index divided by 4 constexpr auto lane = batch_constant {}; constexpr int lane_idx = ((mask / std::integral_constant()) != lane).mask(); return _mm256_blend_ps(r0, r1, lane_idx); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { // cannot use detail::mod_shuffle as the mod and shift are different in this case constexpr auto imm = ((V0 % 2) << 0) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3); XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; } XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask)) { return _mm256_permute_pd(self, imm); } XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask)) { __m256d broadcast = _mm256_permute2f128_pd(self, self, 0x00); // [low | low] return _mm256_permute_pd(broadcast, imm); } XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask)) { __m256d broadcast = _mm256_permute2f128_pd(self, 
self, 0x11); // [high | high] return _mm256_permute_pd(broadcast, imm); } // Fallback to general algorithm. This is the same as the dynamic version with the exception // that possible operations are done at compile time. // swap lanes __m256d swapped = _mm256_permute2f128_pd(self, self, 0x01); // [high | low] // permute within each lane __m256d r0 = _mm256_permute_pd(self, imm); __m256d r1 = _mm256_permute_pd(swapped, imm); // select lane by the mask index divided by 2 constexpr auto lane = batch_constant {}; constexpr int lane_idx = ((mask / std::integral_constant()) != lane).mask(); // blend the two permutes return _mm256_blend_pd(r0, r1, lane_idx); } template < class A, typename T, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch swizzle( batch const& self, batch_constant const& mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask)); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; // See // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3], r4 = matrix_begin[4], r5 = matrix_begin[5], r6 = matrix_begin[6], r7 = matrix_begin[7]; auto t0 = _mm256_unpacklo_ps(r0, r1); auto t1 = _mm256_unpackhi_ps(r0, r1); auto t2 = _mm256_unpacklo_ps(r2, r3); auto t3 = _mm256_unpackhi_ps(r2, r3); auto t4 = _mm256_unpacklo_ps(r4, r5); auto t5 = _mm256_unpackhi_ps(r4, r5); auto t6 = _mm256_unpacklo_ps(r6, r7); auto t7 = _mm256_unpackhi_ps(r6, r7); r0 = _mm256_shuffle_ps(t0, t2, 
_MM_SHUFFLE(1, 0, 1, 0)); r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2)); r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0)); r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2)); r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20); matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20); matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20); matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20); matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31); matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31); matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31); matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11 auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13 auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31 auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33 matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20); matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20); matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31); matrix_begin[3] = _mm256_permute2f128_pd(t1, 
t3, 0x31); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[8]; for (int i = 0; i < 8; ++i) tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 8, sse4_2 {}); batch tmp_hi0[8]; for (int i = 0; i < 8; ++i) tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[8 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 8, sse4_2 {}); batch tmp_lo1[8]; for (int i = 0; i < 8; ++i) tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 8, sse4_2 {}); batch tmp_hi1[8]; for (int i = 0; i < 8; ++i) tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[8 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 8, sse4_2 {}); for (int i = 0; i < 8; ++i) matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 8; ++i) matrix_begin[i + 8] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[16]; for (int i = 0; i < 16; ++i) tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 16, sse4_2 {}); batch tmp_hi0[16]; for (int i = 0; i < 16; ++i) 
tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[16 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 16, sse4_2 {}); batch tmp_lo1[16]; for (int i = 0; i < 16; ++i) tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 16, sse4_2 {}); batch tmp_hi1[16]; for (int i = 0; i < 16; ++i) tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[16 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 16, sse4_2 {}); for (int i = 0; i < 16; ++i) matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 16; ++i) matrix_begin[i + 16] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // trunc template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); } // zip_hi template ::value>> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { // extract high word __m128i self_hi = _mm256_extractf128_si256(self, 1); __m128i other_hi = _mm256_extractf128_si256(other, 1); // interleave __m128i res_lo, res_hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { res_lo = _mm_unpacklo_epi8(self_hi, other_hi); res_hi = _mm_unpackhi_epi8(self_hi, other_hi); } else { res_lo = _mm_unpacklo_epi16(self_hi, other_hi); res_hi = _mm_unpackhi_epi16(self_hi, other_hi); } // fuse return _mm256_castps_si256( _mm256_insertf128_ps( _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), _mm_castsi128_ps(res_hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), 
_mm256_castsi256_ps(other)); return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_ps(self, other); auto hi = _mm256_unpackhi_ps(self, other); return _mm256_permute2f128_ps(lo, hi, 0x31); } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_pd(self, other); auto hi = _mm256_unpackhi_pd(self, other); return _mm256_permute2f128_pd(lo, hi, 0x31); } // zip_lo template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { // extract low word __m128i self_lo = _mm256_extractf128_si256(self, 0); __m128i other_lo = _mm256_extractf128_si256(other, 0); // interleave __m128i res_lo, res_hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { res_lo = _mm_unpacklo_epi8(self_lo, other_lo); res_hi = _mm_unpackhi_epi8(self_lo, other_lo); } else { res_lo = _mm_unpacklo_epi16(self_lo, other_lo); res_hi = _mm_unpackhi_epi16(self_lo, other_lo); } // fuse return _mm256_castps_si256( _mm256_insertf128_ps( _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), _mm_castsi128_ps(res_hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = 
_mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_ps(self, other); auto hi = _mm256_unpackhi_ps(self, other); return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1); } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_pd(self, other); auto hi = _mm256_unpackhi_pd(self, other); return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1); } // widen template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { auto pair_lo = widen(batch(detail::lower_half(x)), sse4_2 {}); auto pair_hi = widen(batch(detail::upper_half(x)), sse4_2 {}); return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) }; } template XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept { __m256d lo = _mm256_cvtps_pd(detail::lower_half(x)); __m256d hi = _mm256_cvtps_pd(detail::upper_half(x)); return { lo, hi }; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx2.hpp000066400000000000000000002067171517435117100242250ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX2_HPP #define XSIMD_AVX2_HPP #include #include #include "../types/xsimd_avx2_register.hpp" #include "../types/xsimd_batch_constant.hpp" #include "./utils/shifts.hpp" #include namespace xsimd { namespace kernel { using namespace types; // abs template ::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_abs_epi16(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_abs_epi32(self); } else { return abs(self, avx {}); } } return self; } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_add_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_add_epi64(self, other); } else { return add(self, other, avx {}); } } // avgr template ::value>> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_avg_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_avg_epu16(self, other); } else { return avgr(self, other, common {}); } } // avg template ::value>> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto adj = ((self ^ other) << 7) >> 7; return avgr(self, other, A {}) - adj; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto adj = ((self ^ other) << 15) >> 15; return avgr(self, other, A {}) - adj; } else { return avg(self, other, common {}); } } // load_masked // AVX2 low-level helpers (operate on raw SIMD 
// registers)
        namespace detail
        {
            // Masked load of 32-bit lanes: forwards `mem` and `mask` unchanged to
            // _mm256_maskload_epi32.
            XSIMD_INLINE __m256i maskload(const int32_t* mem, __m256i mask) noexcept
            {
                return _mm256_maskload_epi32(mem, mask);
            }

            // Masked load of 64-bit lanes: same as above through _mm256_maskload_epi64,
            // after reinterpreting the pointer to the type that intrinsic expects.
            XSIMD_INLINE __m256i maskload(const long long* mem, __m256i mask) noexcept
            {
                return _mm256_maskload_epi64(reinterpret_cast(mem), mask);
            }

            // Returns a 256-bit register built from an all-zero register with `hi`
            // inserted at 128-bit position 1; position 0 keeps the zeros.
            XSIMD_INLINE __m256i zero_extend(__m128i hi) noexcept
            {
                return _mm256_insertf128_si256(_mm256_setzero_si256(), hi, 1);
            }
        }

        // single templated implementation for integer masked loads (32/64-bit)
        //
        // Enabled only when sizeof(T) >= 4 (see the enable_if condition); the
        // static_assert further restricts it to exactly 4- or 8-byte T.
        // `mem` is reinterpreted and handed, together with the compile-time mask
        // materialised via mask.as_batch(), to one of the detail::maskload overloads.
        // NOTE(review): the arguments of std::conditional_t / reinterpret_cast are not
        // visible here; presumably int_t selects int32_t vs long long by sizeof(T) - confirm.
        template
        XSIMD_INLINE std::enable_if_t::value && (sizeof(T) >= 4), batch>
        load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept
        {
            static_assert(sizeof(T) == 4 || sizeof(T) == 8, "load_masked supports only 32/64-bit integers on AVX2");
            using int_t = std::conditional_t;
            // Use the raw register-level maskload helpers for the remaining cases.
            return detail::maskload(reinterpret_cast(mem), mask.as_batch());
        }

        // int32_t entry point: forwards pointer and mask as-is to load_masked with the avx2 tag.
        template
        XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept
        {
            return load_masked(mem, mask, convert {}, Mode {}, avx2 {});
        }

        // uint32_t entry point: reinterprets the pointer, rebuilds the mask constant,
        // loads through load_masked, then bitwise_cast-s the result for the return type.
        template
        XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept
        {
            const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {});
            return bitwise_cast(r);
        }

        // int64_t entry point: forwards pointer and mask as-is to load_masked with the avx2 tag.
        template
        XSIMD_INLINE batch load_masked(int64_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept
        {
            return load_masked(mem, mask, convert {}, Mode {}, avx2 {});
        }

        // uint64_t entry point: same reinterpret / load / bitwise_cast sequence as the
        // uint32_t overload.
        template
        XSIMD_INLINE batch load_masked(uint64_t const* mem, batch_bool_constant, convert, Mode, requires_arch) noexcept
        {
            const auto r = load_masked(reinterpret_cast(mem), batch_bool_constant {}, convert {}, Mode {}, avx2 {});
            return bitwise_cast(r);
        }

        // store_masked
        namespace detail
        {
            // Masked store of 32-bit lanes: forwards to _mm256_maskstore_epi32 after
            // reinterpreting `mem` to the pointer type that intrinsic expects.
            template
            XSIMD_INLINE void maskstore(int32_t* mem, __m256i mask, __m256i src) noexcept
            {
                _mm256_maskstore_epi32(reinterpret_cast(mem), mask, src);
            }
template XSIMD_INLINE void maskstore(int64_t* mem, __m256i mask, __m256i src) noexcept { _mm256_maskstore_epi64(reinterpret_cast(mem), mask, src); } } template XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr size_t lanes_per_half = batch::size / 2; // confined to lower 128-bit half → forward to SSE XSIMD_IF_CONSTEXPR(mask.countl_zero() >= lanes_per_half) { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, sse4_2 {}); } // confined to upper 128-bit half → forward to SSE else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= lanes_per_half) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = detail::upper_half(src); store_masked(mem + lanes_per_half, hi, mhi, Mode {}, sse4_2 {}); } else { detail::maskstore(mem, mask.as_batch(), src); } } template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { const auto s32 = bitwise_cast(src); store_masked(reinterpret_cast(mem), s32, mask, Mode {}, avx2 {}); } template XSIMD_INLINE void store_masked(uint64_t* mem, batch const& src, batch_bool_constant, Mode, requires_arch) noexcept { const auto s64 = bitwise_cast(src); store_masked(reinterpret_cast(mem), s64, batch_bool_constant {}, Mode {}, avx2 {}); } // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept { return _mm256_stream_load_si256((__m256i const*)mem); } template XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept { return _mm256_castsi256_ps(_mm256_stream_load_si256((__m256i const*)mem)); } template XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept { return _mm256_castsi256_pd(_mm256_stream_load_si256((__m256i const*)mem)); } // bitwise_and template ::value>> XSIMD_INLINE batch bitwise_and(batch const& 
self, batch const& other, requires_arch) noexcept { return _mm256_and_si256(self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_si256(self, other); } // bitwise_andnot template ::value>> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_si256(other, self); } template ::value>> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_si256(other, self); } // bitwise_not template ::value>> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); } template ::value>> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_slli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_slli_epi64(self, other); } else { return bitwise_lshift(self, other, avx {}); } } template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Shift must be less than the number of bits in T"); XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // 8-bit left shift via 16-bit shift + mask __m256i shifted = _mm256_slli_epi16(self, shift); // TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow constexpr uint8_t mask8 = static_cast(sizeof(T) == 1 ? 
(~0u << shift) : 0); const __m256i mask = _mm256_set1_epi8(mask8); return _mm256_and_si256(shifted, mask); } XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_slli_epi16(self, shift); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_slli_epi32(self, shift); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_slli_epi64(self, shift); } } template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_sllv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_sllv_epi64(self, other); } else { return bitwise_lshift(self, other, avx {}); } } // bitwise_lshift multiple (constant) specific implementations. // Missing implementations are dispatched to the `batch` overload in xsimd_api. // The 1 byte constant implementation calls the 2 bytes constant version, the 2 bytes // constant version calls into the 4 bytes version which resolves to the dynamic one above. template ::value && (sizeof(T) <= 2), int> = 0> XSIMD_INLINE batch bitwise_lshift( batch const& self, batch_constant shifts, requires_arch req) noexcept { using uint_t = std::make_unsigned_t; // AVX2 only supports 16-bit shifts with a uniform bitshift value, // otherwise emulate using 32-bit shifts. 
XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) { return bitwise_lshift(self, req); } return bitwise_cast( utils::bitwise_lshift_as_twice_larger( bitwise_cast(self), batch_constant(Vs)...> {})); } // bitwise_or template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_si256(self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_si256(self, other); } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); __m256i res = _mm256_srai_epi16(self, other); return _mm256_or_si256( detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, sign_mask, cmp_is_negative), _mm256_andnot_si256(sign_mask, res)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srai_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srai_epi32(self, other); } else { return bitwise_rshift(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srli_epi64(self, other); } else { return bitwise_rshift(self, other, avx {}); } } } template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Shift amount must be less than the number of bits in T"); if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i sign_mask 
= _mm256_set1_epi16((0xFF00 >> shift) & 0x00FF); __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); __m256i res = _mm256_srai_epi16(self, shift); return _mm256_or_si256( detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, sign_mask, cmp_is_negative), _mm256_andnot_si256(sign_mask, res)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srai_epi16(self, shift); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srai_epi32(self, shift); } else { return bitwise_rshift(self, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // 8-bit left shift via 16-bit shift + mask const __m256i shifted = _mm256_srli_epi16(self, shift); // TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow constexpr uint8_t mask8 = static_cast(sizeof(T) == 1 ? ((1u << shift) - 1u) : 0); const __m256i mask = _mm256_set1_epi8(mask8); return _mm256_and_si256(shifted, mask); } XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srli_epi16(self, shift); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srli_epi32(self, shift); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srli_epi64(self, shift); } } } template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srav_epi32(self, other); } else { return bitwise_rshift(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srlv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srlv_epi64(self, other); } else { return bitwise_rshift(self, other, avx {}); } } } // bitwise_xor template ::value>> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_si256(self, other); } template ::value>> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, 
batch_bool const& other, requires_arch) noexcept { return _mm256_xor_si256(self, other); } // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to avx __m256i xH = _mm256_srli_epi64(x, 32); xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84 __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to avx __m256i xH = _mm256_srai_epi32(x, 16); xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 
0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67 __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } } // eq template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_cmpeq_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_cmpeq_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_cmpeq_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_cmpeq_epi64(self, other); } else { return eq(self, other, avx {}); } } // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_epi32(reinterpret_cast(src), index, sizeof(T)); } template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i64gather_epi64(reinterpret_cast(src), index, sizeof(T)); } template = 0> XSIMD_INLINE batch gather(batch const&, float const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_ps(src, index, 
sizeof(float)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i64gather_pd(src, index, sizeof(double)); } // gather: handmade conversions template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data)); } // lt template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_cmpgt_epi8(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_cmpgt_epi16(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_cmpgt_epi32(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_cmpgt_epi64(other, self); } else { return lt(self, other, avx {}); } } else { return lt(self, other, avx {}); } } // load_complex template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; batch_type real = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), _MM_SHUFFLE(3, 1, 2, 0))); batch_type imag = _mm256_castpd_ps( _mm256_permute4x64_pd( 
_mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), _MM_SHUFFLE(3, 1, 2, 0))); return { real, imag }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); return { real, imag }; } // load_unaligned template ::value>> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return { _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem)) }; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto bpack = _mm_loadu_si128((__m128i const*)mem); return { _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack)) }; } // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this. // GCC/Clang/MSVC will turn it into the correct load. 
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { #if defined(__x86_64__) uint64_t tmp; memcpy(&tmp, mem, sizeof(tmp)); auto val = _mm_cvtsi64_si128(tmp); #else __m128i val; memcpy(&val, mem, sizeof(uint64_t)); #endif return { _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(val)) }; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { uint32_t tmp; memcpy(&tmp, mem, sizeof(tmp)); return { _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp))) }; } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { return { _mm256_castsi256_ps(load_unaligned(mem, batch_bool {}, r).data) }; } template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { return { _mm256_castsi256_pd(load_unaligned(mem, batch_bool {}, r).data) }; } // mask template ::value>> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12); } else { return mask(self, avx {}); } } // max template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_max_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_max_epi32(self, other); } else { return max(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_max_epu16(self, 
other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_max_epu32(self, other); } else { return max(self, other, avx {}); } } } // min template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_min_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_min_epi32(self, other); } else { return min(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_min_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_min_epu32(self, other); } else { return min(self, other, avx {}); } } } // mul template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00); __m256i res_lo = _mm256_mullo_epi16(self, other); __m256i other_hi = _mm256_srli_epi16(other, 8); __m256i self_hi = _mm256_and_si256(self, mask_hi); __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi); __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi); return res; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_mullo_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_mullo_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_add_epi64( _mm256_mul_epu32(self, other), _mm256_slli_epi64( _mm256_add_epi64( _mm256_mul_epu32(other, _mm256_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))), _mm256_mul_epu32(self, _mm256_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))), 32)); } else { return mul(self, other, avx {}); } } // reduce_add template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { __m256i 
tmp1 = _mm256_hadd_epi32(self, self); __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); return _mm_cvtsi128_si32(tmp4); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); __m256i tmp2 = _mm256_add_epi64(self, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); #if defined(__x86_64__) return _mm_cvtsi128_si64(res); #else __m128i m; _mm_storel_epi64(&m, res); int64_t i; std::memcpy(&i, &m, sizeof(i)); return i; #endif } else { return reduce_add(self, avx {}); } } // rotate_left template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { auto other = _mm256_permute2x128_si256(self, self, 0x1); if (N < 16) { return _mm256_alignr_epi8(other, self, N); } else { return _mm256_alignr_epi8(self, other, N - 16); } } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { auto other = _mm256_permute2x128_si256(self, self, 0x1); if (N < 8) { return _mm256_alignr_epi8(other, self, 2 * N); } else { return _mm256_alignr_epi8(self, other, 2 * (N - 8)); } } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); } // sadd template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_adds_epi16(self, other); } else { return sadd(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_adds_epu8(self, other); } else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_adds_epu16(self, other); } else { return sadd(self, other, avx {}); } } } // select template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_blendv_epi8(false_br, true_br, cond); } else { return select(cond, true_br, false_br, avx {}); } } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { // FIXME: for some reason mask here is not considered as an immediate, // but it's okay for _mm256_blend_epi32 // case 2: return _mm256_blend_epi16(false_br, true_br, mask); XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { constexpr int mask = batch_bool_constant::mask(); return _mm256_blend_epi32(false_br, true_br, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { constexpr int mask = batch_bool_constant::mask(); constexpr int imask = detail::interleave(mask); return _mm256_blend_epi32(false_br, true_br, imask); } else { return select(batch_bool { Values... 
}, true_br, false_br, avx2 {}); } } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; auto y = _mm256_bslli_epi128(x, M); return _mm256_permute2x128_si256(y, y, 0x28); } if (BitCount == 128) { return _mm256_permute2x128_si256(x, x, 0x28); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; auto y = _mm256_bslli_epi128(x, M); auto z = _mm256_bsrli_epi128(x, 16 - M); auto w = _mm256_permute2x128_si256(z, z, 0x28); return _mm256_or_si256(y, w); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; auto y = _mm256_bsrli_epi128(x, M); return _mm256_permute2x128_si256(y, y, 0x81); } if (BitCount == 128) { return _mm256_permute2x128_si256(x, x, 0x81); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; auto y = _mm256_bsrli_epi128(x, M); auto z = _mm256_bslli_epi128(x, 16 - M); auto w = _mm256_permute2x128_si256(z, z, 0x81); return _mm256_or_si256(y, w); } // store namespace detail { template XSIMD_INLINE void store_bool_avx2(__m256i b, bool* mem, T) noexcept { // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this. // GCC/Clang/MSVC will turn it into the correct store. 
XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // negate mask to convert to 0 or 1 auto val = _mm256_sub_epi8(_mm256_set1_epi8(0), b); memcpy(mem, &val, sizeof(val)); return; } auto b_hi = _mm256_extractf128_si256(b, 1); auto b_lo = _mm256_castsi256_si128(b); XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b_lo, b_hi)); memcpy(mem, &val, sizeof(val)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto pack_16 = _mm_packs_epi32(b_lo, b_hi); auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16)); #if defined(__x86_64__) auto val_lo = _mm_cvtsi128_si64(val); memcpy(mem, &val_lo, sizeof(val_lo)); #else memcpy(mem, &val, sizeof(uint64_t)); #endif } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { uint32_t mask = _mm256_movemask_epi8(_mm256_srli_epi64(b, 56)); memcpy(mem, &mask, sizeof(mask)); } else { assert(false && "unsupported arch/op combination"); } } XSIMD_INLINE __m256i avx_to_i(__m256 x) { return _mm256_castps_si256(x); } XSIMD_INLINE __m256i avx_to_i(__m256d x) { return _mm256_castpd_si256(x); } XSIMD_INLINE __m256i avx_to_i(__m256i x) { return x; } } template XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { detail::store_bool_avx2(detail::avx_to_i(b), mem, T {}); } // ssub template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_subs_epi16(self, other); } else { return ssub(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_subs_epu16(self, other); } else { return ssub(self, other, avx {}); } } } // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return 
_mm256_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_sub_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_sub_epi64(self, other); } else { return sub(self, other, avx {}); } } // swizzle (dynamic mask) on 8 and 16 bits; see avx for 32 and 64 bits versions template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // swap lanes __m256i swapped = _mm256_permute2x128_si256(self, self, 0x01); // [high | low] // normalize mask taking modulo 16 batch half_mask = mask & 0b1111u; // permute bytes within each lane (AVX2 only) __m256i r0 = _mm256_shuffle_epi8(self, half_mask); __m256i r1 = _mm256_shuffle_epi8(swapped, half_mask); // select lane by the mask index divided by 16, first lane is 0, second is 16. constexpr auto lane_size = make_batch_constant(); constexpr auto lane = (make_iota_batch_constant() / lane_size) * lane_size; batch_bool blend_mask = (mask & 0b10000u) != lane; return _mm256_blendv_epi8(r0, r1, blend_mask); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch req) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); } template XSIMD_INLINE batch swizzle( batch const& self, batch mask, requires_arch req) noexcept { // No blend/shuffle for 16 bits, we need to use the 8 bits version const auto self_bytes = bitwise_cast(self); // If a mask entry is k, we want 2k in low byte and 2k+1 in high byte const auto mask_2k_2kp1 = bitwise_cast((mask << 1) | (mask << 9) | 0x100); return bitwise_cast(swizzle(self_bytes, mask_2k_2kp1, req)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch req) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); } namespace detail { template struct swizzle_mask { static constexpr auto values = std::array { Vals... 
}; static constexpr T get(std::size_t idx_, std::size_t size_) noexcept { const T size = static_cast(size_); const T idx = static_cast(idx_); const T val = values[idx_]; // Check if value in bounds if ((T(0) <= val) && (val < size)) { // Whether we need to access the value from the other lane const bool val_is_cross_lane = (idx < (size / 2)) != (val < (size / 2)); if (val_is_cross_lane == cross_batch) { return val % (size / 2); } } // Out of bounds with most significant bit set to 1 will set the swizzle target to 0 return ~T {}; } }; } // swizzle (constant mask) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { static_assert(sizeof...(Vals) == 32, "Must contain as many uint8_t as can fit in avx register"); XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; } constexpr auto lane_mask = mask % std::integral_constant(); XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask)) { return _mm256_shuffle_epi8(self, lane_mask.as_batch()); } XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask)) { __m256i broadcast = _mm256_permute2x128_si256(self, self, 0x00); // [low | low] return _mm256_shuffle_epi8(broadcast, lane_mask.as_batch()); } XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask)) { __m256i broadcast = _mm256_permute2x128_si256(self, self, 0x11); // [high | high] return _mm256_shuffle_epi8(broadcast, lane_mask.as_batch()); } // swap lanes __m256i swapped = _mm256_permute2x128_si256(self, self, 0x01); // [high | low] // We can outsmart the dynamic version by creating a compile-time mask that leaves zeros // where it does not need to select data, resulting in a simple OR merge of the two batches. 
constexpr auto self_mask = make_batch_constant, A>(); constexpr auto cross_mask = make_batch_constant, A>(); // permute bytes within each lane (AVX2 only) __m256i r0 = _mm256_shuffle_epi8(self, self_mask.as_batch()); __m256i r1 = _mm256_shuffle_epi8(swapped, cross_mask.as_batch()); return _mm256_or_si256(r0, r1); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch req) noexcept { static_assert(sizeof...(Vals) == 32, "Must contain as many uint8_t as can fit in avx register"); return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); } template < class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7, uint16_t V8, uint16_t V9, uint16_t V10, uint16_t V11, uint16_t V12, uint16_t V13, uint16_t V14, uint16_t V15> XSIMD_INLINE batch swizzle( batch const& self, batch_constant, requires_arch req) noexcept { const auto self_bytes = bitwise_cast(self); // If a mask entry is k, we want 2k in low byte and 2k+1 in high byte auto constexpr mask_2k_2kp1 = batch_constant< uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1, 2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1, 2 * V8, 2 * V8 + 1, 2 * V9, 2 * V9 + 1, 2 * V10, 2 * V10 + 1, 2 * V11, 2 * V11 + 1, 2 * V12, 2 * V12 + 1, 2 * V13, 2 * V13 + 1, 2 * V14, 2 * V14 + 1, 2 * V15, 2 * V15 + 1> {}; return bitwise_cast(swizzle(self_bytes, mask_2k_2kp1, req)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch req) noexcept { static_assert(sizeof...(Vals) == 16, "Must contain as many uint16_t as can fit in avx register"); return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; } XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask)) { constexpr 
auto lane_mask = mask % std::integral_constant(); // Cheaper intrinsics when not crossing lanes // Contrary to the uint64_t version, the limits of 8 bits for the immediate constant // cannot make different permutations across lanes batch permuted = _mm256_permutevar_ps(bitwise_cast(self), lane_mask.as_batch()); return bitwise_cast(permuted); } return _mm256_permutevar8x32_epi32(self, mask.as_batch()); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch req) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; } XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask)) { constexpr uint8_t lane_mask = (V0 % 2) | ((V1 % 2) << 1) | ((V2 % 2) << 2) | ((V3 % 2) << 3); // Cheaper intrinsics when not crossing lanes batch permuted = _mm256_permute_pd(bitwise_cast(self), lane_mask); return bitwise_cast(permuted); } constexpr auto mask_int = detail::mod_shuffle(V0, V1, V2, V3); return _mm256_permute4x64_epi64(self, mask_int); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch req) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, req)); } // zip_hi template ::value>> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto lo = _mm256_unpacklo_epi8(self, other); auto hi = _mm256_unpackhi_epi8(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto lo = _mm256_unpacklo_epi16(self, other); auto hi = _mm256_unpackhi_epi16(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_epi32(self, other); auto hi = _mm256_unpackhi_epi32(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_epi64(self, other); auto hi = _mm256_unpackhi_epi64(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else { assert(false && "unsupported arch/op combination"); return {}; } } // zip_lo template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto lo = _mm256_unpacklo_epi8(self, other); auto hi = _mm256_unpackhi_epi8(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto lo = _mm256_unpacklo_epi16(self, other); auto hi = _mm256_unpackhi_epi16(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_epi32(self, other); auto hi = _mm256_unpackhi_epi32(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_epi64(self, other); auto hi = _mm256_unpackhi_epi64(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else { assert(false && "unsupported arch/op combination"); return {}; } } // widen template ::value>> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { __m128i x_lo = detail::lower_half(x); __m128i x_hi = detail::upper_half(x); __m256i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm256_cvtepi32_epi64(x_lo); hi = _mm256_cvtepi32_epi64(x_hi); } else { lo = _mm256_cvtepu32_epi64(x_lo); hi = _mm256_cvtepu32_epi64(x_hi); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm256_cvtepi16_epi32(x_lo); hi = _mm256_cvtepi16_epi32(x_hi); } else { lo = _mm256_cvtepu16_epi32(x_lo); hi = _mm256_cvtepu16_epi32(x_hi); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { 
XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm256_cvtepi8_epi16(x_lo); hi = _mm256_cvtepi8_epi16(x_hi); } else { lo = _mm256_cvtepu8_epi16(x_lo); hi = _mm256_cvtepu8_epi16(x_hi); } } return { lo, hi }; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512bw.hpp000066400000000000000000000673071517435117100247240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512BW_HPP #define XSIMD_AVX512BW_HPP #include #include #include "../types/xsimd_avx512bw_register.hpp" namespace xsimd { namespace kernel { using namespace types; namespace detail { template XSIMD_INLINE batch_bool compare_int_avx512bw(batch const& self, batch const& other) noexcept { using register_type = typename batch_bool::register_type; if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return 
(register_type)_mm512_cmp_epu64_mask(self, other, Cmp); } } } } // abs template ::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) { return self; } XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_abs_epi16(self); } else { return abs(self, avx512dq {}); } } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_add_epi16(self, other); } else { return add(self, other, avx512dq {}); } } // avgr template ::value>> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_avg_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_avg_epu16(self, other); } else { return avgr(self, other, common {}); } } // avg template ::value>> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto adj = ((self ^ other) << 7) >> 7; return avgr(self, other, A {}) - adj; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto adj = ((self ^ other) << 15) >> 15; return avgr(self, other, A {}) - adj; } else { return avg(self, other, common {}); } } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_slli_epi16(self, other); #endif } else { return bitwise_lshift(self, other, avx512dq {}); } } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if 
(std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); __m512i zeros = _mm512_setzero_si512(); __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self); __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else __m512i res = _mm512_srai_epi16(self, other); #endif return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srai_epi16(self, other); #endif } else { return bitwise_rshift(self, other, avx512dq {}); } } else { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srli_epi16(self, other); #endif } else { return bitwise_rshift(self, other, avx512dq {}); } } } // decr_if template ::value>> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_mask_sub_epi8(self, mask.data, self, _mm512_set1_epi8(1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mask_sub_epi16(self, mask.data, self, _mm512_set1_epi16(1)); } else { return decr_if(self, mask, avx512dq {}); } } // eq template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // ge template ::value>> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // gt template ::value>> XSIMD_INLINE batch_bool 
gt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // incr_if template ::value>> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_mask_add_epi8(self, mask.data, self, _mm512_set1_epi8(1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mask_add_epi16(self, mask.data, self, _mm512_set1_epi16(1)); } else { return incr_if(self, mask, avx512dq {}); } } // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_mask_set1_epi8(self, __mmask64(1ULL << (I & 63)), val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mask_set1_epi16(self, __mmask32(1 << (I & 31)), val); } else { return insert(self, val, pos, avx512dq {}); } } // le template ::value>> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // lt template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // load template ::value>> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { using mask_type = typename batch_bool::register_type; XSIMD_IF_CONSTEXPR(batch_bool::size == 64) { __m512i bool_val = _mm512_loadu_si512((__m512i const*)mem); return (mask_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); } else XSIMD_IF_CONSTEXPR(batch_bool::size == 32) { __m256i bpack = _mm256_loadu_si256((__m256i const*)mem); return (mask_type)_mm512_cmpgt_epu16_mask(_mm512_cvtepu8_epi16(bpack), _mm512_setzero_si512()); } else XSIMD_IF_CONSTEXPR(batch_bool::size == 16) { __m128i bpack = _mm_loadu_si128((__m128i const*)mem); return (mask_type)_mm512_cmpgt_epu32_mask(_mm512_cvtepu8_epi32(bpack), 
_mm512_setzero_si512()); } else XSIMD_IF_CONSTEXPR(batch_bool::size == 8) { __m128i bpack = _mm_loadl_epi64((__m128i const*)mem); return (mask_type)_mm512_cmpgt_epu64_mask(_mm512_cvtepu8_epi64(bpack), _mm512_setzero_si512()); } else { assert(false && "unexpected batch size"); return {}; } } template ::value>> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool, requires_arch) noexcept { using mask_type = typename batch_bool::register_type; XSIMD_IF_CONSTEXPR(batch_bool::size == 64) { __m512i bool_val = _mm512_load_si512((__m512i const*)mem); return (mask_type)_mm512_cmpgt_epu8_mask(bool_val, _mm512_setzero_si512()); } else XSIMD_IF_CONSTEXPR(batch_bool::size == 32) { __m256i bpack = _mm256_load_si256((__m256i const*)mem); return (mask_type)_mm512_cmpgt_epu16_mask(_mm512_cvtepu8_epi16(bpack), _mm512_setzero_si512()); } else XSIMD_IF_CONSTEXPR(batch_bool::size == 16) { __m128i bpack = _mm_load_si128((__m128i const*)mem); return (mask_type)_mm512_cmpgt_epu32_mask(_mm512_cvtepu8_epi32(bpack), _mm512_setzero_si512()); } else XSIMD_IF_CONSTEXPR(batch_bool::size == 8) { __m128i bpack = _mm_loadl_epi64((__m128i const*)mem); return (mask_type)_mm512_cmpgt_epu64_mask(_mm512_cvtepu8_epi64(bpack), _mm512_setzero_si512()); } else { assert(false && "unexpected batch size"); return {}; } } // max template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_max_epi16(self, other); } else { return max(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_max_epu16(self, other); } else { return max(self, other, avx512dq {}); } } } // min template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if 
(std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_min_epi16(self, other); } else { return min(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_min_epu16(self, other); } else { return min(self, other, avx512dq {}); } } } // mul template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); return _mm512_or_si512(upper, lower); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mullo_epi16(self, other); } else { return mul(self, other, avx512dq {}); } } // neq template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // sadd template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_adds_epi16(self, other); } else { return sadd(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_adds_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_adds_epu16(self, other); } else { return sadd(self, other, avx512dq {}); } } } // select template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data); } 
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data); } else { return select(cond, true_br, false_br, avx512dq {}); } } // slide_left template > XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency."); __mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31); auto slide_pattern = make_batch_constant, A>(); return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x); } // slide_right template > XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { static_assert((N & 3) == 2 && N < 64, "The AVX512F implementation may have a lower latency."); __mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31); auto slide_pattern = make_batch_constant, A>(); return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), x); } // ssub template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_subs_epi16(self, other); } else { return ssub(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_subs_epu16(self, other); } else { return ssub(self, other, avx512dq {}); } } } // store template XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept { constexpr auto size = batch_bool::size; __m512i bool_val = _mm512_maskz_set1_epi8(self.data, 0x01); __mmask64 mask = size >= 64 ? 
~(__mmask64)0 : (1ULL << size) - 1; _mm512_mask_storeu_epi8((void*)mem, mask, bool_val); } // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_sub_epi16(self, other); } else { return sub(self, other, avx512dq {}); } } // swizzle (dynamic version) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi16(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); } // swizzle (static version) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512bw {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512bw {}); } // widen template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { __m256i x_lo = _mm512_extracti64x4_epi64(x, 0); __m256i x_hi = _mm512_extracti64x4_epi64(x, 1); __m512i lo = _mm512_cvtepu8_epi16(x_lo); __m512i hi = _mm512_cvtepu8_epi16(x_hi); return { lo, hi }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { __m256i x_lo = _mm512_extracti64x4_epi64(x, 0); __m256i x_hi = _mm512_extracti64x4_epi64(x, 1); __m512i lo = _mm512_cvtepi8_epi16(x_lo); __m512i hi = _mm512_cvtepi8_epi16(x_hi); return { lo, hi }; } // zip_hi template ::value>> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { lo = _mm512_unpacklo_epi8(self, other); hi = _mm512_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { lo = _mm512_unpacklo_epi16(self, 
other); hi = _mm512_unpackhi_epi16(self, other); } else { return zip_hi(self, other, avx512f {}); } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), _mm512_extracti32x4_epi32(lo, 3), 2), _mm512_extracti32x4_epi32(hi, 2), 1); } // zip_lo template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { lo = _mm512_unpacklo_epi8(self, other); hi = _mm512_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { lo = _mm512_unpacklo_epi16(self, other); hi = _mm512_unpackhi_epi16(self, other); } else { return zip_lo(self, other, avx512f {}); } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), _mm512_extracti32x4_epi32(hi, 1), 3), _mm512_extracti32x4_epi32(lo, 1), 2); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512cd.hpp000066400000000000000000000017231517435117100246700ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512CD_HPP #define XSIMD_AVX512CD_HPP #include "../types/xsimd_avx512cd_register.hpp" namespace xsimd { namespace kernel { // Nothing there yet. 
} } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512dq.hpp000066400000000000000000000306061517435117100247100ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512DQ_HPP #define XSIMD_AVX512DQ_HPP #include "../types/xsimd_avx512dq_register.hpp" namespace xsimd { namespace kernel { using namespace types; // load_masked template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 8) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = load_masked(mem + 8, mhi, convert {}, Mode {}, avx2 {}); return _mm512_inserti32x8(_mm512_setzero_si512(), hi, 1); } return load_masked(mem, mask, convert {}, Mode {}, avx512f {}); } template XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.countr_zero() >= 8) { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = load_masked(mem + 8, mhi, convert {}, Mode {}, avx2 {}); return _mm512_insertf32x8(_mm512_setzero_ps(), hi, 1); } return load_masked(mem, mask, convert {}, Mode {}, avx512f {}); } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_ps(self, other); } template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_pd(self, other); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch 
const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_ps(other, self); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_pd(other, self); } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_ps(self, other); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_ps(self, other); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_pd(self, other); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_ps(tmp1, tmp2); \ } XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); 
XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 // The following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx1 = _mm512_add_ps(tmp1, tmp2); \ \ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx2 = _mm512_add_ps(tmp3, tmp4); \ \ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ _mm512_extractf32x8_ps(resx3, 1)); \ } XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_insertf32x8(concat, halfx1, 1); return concat; } // ldexp template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other)); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mullo_epi64(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mullo_epi64(self, other); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, 
requires_arch) noexcept { return _mm512_cvtpd_epi64(self); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept { __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); __m256 res1 = _mm256_add_ps(tmp1, tmp2); return reduce_add(batch(res1), avx2 {}); } // reduce_mul template XSIMD_INLINE float reduce_mul(batch const& rhs, requires_arch) noexcept { __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); __m256 res1 = _mm256_mul_ps(tmp1, tmp2); return reduce_mul(batch(res1), avx2 {}); } // swizzle constant mask template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { constexpr bool dup_lo = detail::is_dup_lo(mask); constexpr bool dup_hi = detail::is_dup_hi(mask); XSIMD_IF_CONSTEXPR(dup_lo || dup_hi) { const batch half = _mm512_extractf32x8_ps(self, dup_lo ? 0 : 1); constexpr std::conditional_t, batch_constant> half_mask {}; auto permuted = swizzle(half, half_mask, avx2 {}); // merge the two slices into an AVX512F register: return _mm512_broadcast_f32x8(permuted); // duplicates the 256-bit perm into both halves } return swizzle(self, mask, avx512f {}); } // convert namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepi64_pd(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvttpd_epi64(self); } } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512er.hpp000066400000000000000000000015741517435117100247140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512ER_HPP #define XSIMD_AVX512ER_HPP #include "../types/xsimd_avx512er_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512f.hpp000066400000000000000000004231351517435117100245340ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512F_HPP #define XSIMD_AVX512F_HPP #include #include #include #include "../types/xsimd_avx512f_register.hpp" #include "../types/xsimd_batch_constant.hpp" namespace xsimd { namespace kernel { using namespace types; // fwd template XSIMD_INLINE batch decr_if(batch const& self, Mask const& mask, requires_arch) noexcept; template XSIMD_INLINE batch incr_if(batch const& self, Mask const& mask, requires_arch) noexcept; template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; namespace detail { XSIMD_INLINE __m256 lower_half(__m512 self) noexcept { return _mm512_castps512_ps256(self); } XSIMD_INLINE __m256d lower_half(__m512d self) noexcept { return _mm512_castpd512_pd256(self); } XSIMD_INLINE __m256i lower_half(__m512i self) noexcept { return _mm512_castsi512_si256(self); } 
XSIMD_INLINE __m256 upper_half(__m512 self) noexcept { return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(self), 1)); } XSIMD_INLINE __m256d upper_half(__m512d self) noexcept { return _mm512_extractf64x4_pd(self, 1); } XSIMD_INLINE __m256i upper_half(__m512i self) noexcept { return _mm512_extracti64x4_epi64(self, 1); } XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept { return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); } XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept { return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1)); } XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept { return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); } template __m512i fwd_to_avx(F f, __m512i self) { __m256i self_low = lower_half(self), self_high = upper_half(self); __m256i res_low = f(self_low); __m256i res_high = f(self_high); return merge_avx(res_low, res_high); } template __m512i fwd_to_avx(F f, __m512i self, __m512i other) { __m256i self_low = lower_half(self), self_high = upper_half(self), other_low = lower_half(other), other_high = upper_half(other); __m256i res_low = f(self_low, other_low); __m256i res_high = f(self_high, other_high); return merge_avx(res_low, res_high); } template __m512i fwd_to_avx(F f, __m512i self, int32_t other) { __m256i self_low = lower_half(self), self_high = upper_half(self); __m256i res_low = f(self_low, other); __m256i res_high = f(self_high, other); return merge_avx(res_low, res_high); } } namespace detail { XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept { static const unsigned short MortonTable256[256] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, 0x0400, 0x0401, 
0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115, 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 }; uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF]; return z; } template XSIMD_INLINE batch_bool compare_int_avx512f(batch const& self, 
batch const& other) noexcept { using register_type = typename batch_bool::register_type; if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // shifting to take sign into account uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, (batch(other.data) & batch(0x000000FF)) << 24, Cmp); uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, (batch(other.data) & batch(0x0000FF00)) << 16, Cmp); uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, (batch(other.data) & batch(0x00FF0000)) << 8, Cmp); uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); uint64_t mask = 0; for (unsigned i = 0; i < 16; ++i) { mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); } return (register_type)mask; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { // shifting to take sign into account uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, (batch(other.data) & batch(0x0000FFFF)) << 16, Cmp); uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); return static_cast(morton(mask_low, mask_high)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); uint64_t 
mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); uint64_t mask = 0; for (unsigned i = 0; i < 16; ++i) { mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); } return (register_type)mask; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); return static_cast(morton(mask_low, mask_high)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); } } } } namespace detail { // --- compact unified helpers for masked loads/stores & zero-extension --- struct high_tag { }; // zero-extend 256 -> 512 (low) and place-into-high variants XSIMD_INLINE __m512i zero_extend(__m256i lo) noexcept { return _mm512_zextsi256_si512(lo); } XSIMD_INLINE __m512 zero_extend(__m256 lo) noexcept { return _mm512_zextps256_ps512(lo); } XSIMD_INLINE __m512d zero_extend(__m256d lo) noexcept { return _mm512_zextpd256_pd512(lo); } XSIMD_INLINE __m512i zero_extend(__m256i hi, high_tag) noexcept { return _mm512_inserti64x4(_mm512_setzero_si512(), hi, 1); } XSIMD_INLINE __m512 zero_extend(__m256 hi, high_tag) noexcept { return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(_mm512_setzero_ps()), _mm256_castps_pd(hi), 1)); } XSIMD_INLINE __m512d zero_extend(__m256d hi, high_tag) noexcept { return 
_mm512_insertf64x4(_mm512_setzero_pd(), hi, 1); } // pointer-level masked loads (overloads by pointer/value size and alignment) XSIMD_INLINE __m512 load_masked(const float* mem, uint64_t m, aligned_mode) noexcept { return _mm512_maskz_load_ps((__mmask16)m, mem); } XSIMD_INLINE __m512 load_masked(const float* mem, uint64_t m, unaligned_mode) noexcept { return _mm512_maskz_loadu_ps((__mmask16)m, mem); } XSIMD_INLINE __m512d load_masked(const double* mem, uint64_t m, aligned_mode) noexcept { return _mm512_maskz_load_pd((__mmask8)m, mem); } XSIMD_INLINE __m512d load_masked(const double* mem, uint64_t m, unaligned_mode) noexcept { return _mm512_maskz_loadu_pd((__mmask8)m, mem); } XSIMD_INLINE __m512i load_masked(const int32_t* mem, uint64_t m, aligned_mode) noexcept { return _mm512_maskz_load_epi32((__mmask16)m, mem); } XSIMD_INLINE __m512i load_masked(const int32_t* mem, uint64_t m, unaligned_mode) noexcept { return _mm512_maskz_loadu_epi32((__mmask16)m, mem); } XSIMD_INLINE __m512i load_masked(const uint32_t* mem, uint64_t m, aligned_mode) noexcept { return _mm512_maskz_load_epi32((__mmask16)m, reinterpret_cast(mem)); } XSIMD_INLINE __m512i load_masked(const uint32_t* mem, uint64_t m, unaligned_mode) noexcept { return _mm512_maskz_loadu_epi32((__mmask16)m, reinterpret_cast(mem)); } XSIMD_INLINE __m512i load_masked(const int64_t* mem, uint64_t m, aligned_mode) noexcept { return _mm512_maskz_load_epi64((__mmask8)m, mem); } XSIMD_INLINE __m512i load_masked(const int64_t* mem, uint64_t m, unaligned_mode) noexcept { return _mm512_maskz_loadu_epi64((__mmask8)m, mem); } XSIMD_INLINE __m512i load_masked(const uint64_t* mem, uint64_t m, aligned_mode) noexcept { return _mm512_maskz_load_epi64((__mmask8)m, reinterpret_cast(mem)); } XSIMD_INLINE __m512i load_masked(const uint64_t* mem, uint64_t m, unaligned_mode) noexcept { return _mm512_maskz_loadu_epi64((__mmask8)m, reinterpret_cast(mem)); } // Register-level AVX2-forward helpers: accept 256-bit halves and return 512-bit 
XSIMD_INLINE __m512 load_masked(__m256 lo) noexcept { return zero_extend(lo); } XSIMD_INLINE __m512d load_masked(__m256d lo) noexcept { return zero_extend(lo); } XSIMD_INLINE __m512i load_masked(__m256i lo) noexcept { return zero_extend(lo); } XSIMD_INLINE __m512 load_masked(__m256 hi, high_tag) noexcept { return zero_extend(hi, high_tag {}); } XSIMD_INLINE __m512d load_masked(__m256d hi, high_tag) noexcept { return zero_extend(hi, high_tag {}); } XSIMD_INLINE __m512i load_masked(__m256i hi, high_tag) noexcept { return zero_extend(hi, high_tag {}); } // pointer-level masked stores (overloads by pointer/value size and alignment) XSIMD_INLINE void store_masked(float* mem, __m512 src, uint64_t m, aligned_mode) noexcept { _mm512_mask_store_ps(mem, (__mmask16)m, src); } XSIMD_INLINE void store_masked(float* mem, __m512 src, uint64_t m, unaligned_mode) noexcept { _mm512_mask_storeu_ps(mem, (__mmask16)m, src); } XSIMD_INLINE void store_masked(double* mem, __m512d src, uint64_t m, aligned_mode) noexcept { _mm512_mask_store_pd(mem, (__mmask8)m, src); } XSIMD_INLINE void store_masked(double* mem, __m512d src, uint64_t m, unaligned_mode) noexcept { _mm512_mask_storeu_pd(mem, (__mmask8)m, src); } XSIMD_INLINE void store_masked(int32_t* mem, __m512i src, uint64_t m, aligned_mode) noexcept { _mm512_mask_store_epi32(mem, (__mmask16)m, src); } XSIMD_INLINE void store_masked(int32_t* mem, __m512i src, uint64_t m, unaligned_mode) noexcept { _mm512_mask_storeu_epi32(mem, (__mmask16)m, src); } XSIMD_INLINE void store_masked(uint32_t* mem, __m512i src, uint64_t m, aligned_mode) noexcept { _mm512_mask_store_epi32(reinterpret_cast(mem), (__mmask16)m, src); } XSIMD_INLINE void store_masked(uint32_t* mem, __m512i src, uint64_t m, unaligned_mode) noexcept { _mm512_mask_storeu_epi32(reinterpret_cast(mem), (__mmask16)m, src); } XSIMD_INLINE void store_masked(int64_t* mem, __m512i src, uint64_t m, aligned_mode) noexcept { _mm512_mask_store_epi64(mem, (__mmask8)m, src); } XSIMD_INLINE void 
store_masked(int64_t* mem, __m512i src, uint64_t m, unaligned_mode) noexcept { _mm512_mask_storeu_epi64(mem, (__mmask8)m, src); } XSIMD_INLINE void store_masked(uint64_t* mem, __m512i src, uint64_t m, aligned_mode) noexcept { _mm512_mask_store_epi64(reinterpret_cast(mem), (__mmask8)m, src); } XSIMD_INLINE void store_masked(uint64_t* mem, __m512i src, uint64_t m, unaligned_mode) noexcept { _mm512_mask_storeu_epi64(reinterpret_cast(mem), (__mmask8)m, src); } } // namespace detail template = 4)>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = load_masked(mem, mlo, convert {}, Mode {}, avx2 {}); return detail::load_masked(lo); // zero-extend low half } else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = load_masked(mem + half, mhi, convert {}, Mode {}, avx2 {}); return detail::load_masked(hi, detail::high_tag {}); } else { // fallback to centralized pointer-level helper return detail::load_masked(mem, mask.mask(), Mode {}); } } template = 4)>> XSIMD_INLINE void store_masked(T* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { constexpr auto half = batch::size / 2; XSIMD_IF_CONSTEXPR(mask.countl_zero() >= half) // lower-half AVX2 forwarding { constexpr auto mlo = ::xsimd::detail::lower_half(mask); const auto lo = detail::lower_half(src); store_masked(mem, lo, mlo, Mode {}, avx2 {}); } else XSIMD_IF_CONSTEXPR(mask.countr_zero() >= half) // upper-half AVX2 forwarding { constexpr auto mhi = ::xsimd::detail::upper_half(mask); const auto hi = detail::upper_half(src); store_masked(mem + half, hi, mhi, Mode {}, avx2 {}); } else { // fallback to centralized pointer-level helper 
detail::store_masked(mem, src, mask.mask(), Mode {}); } } // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m512 self_asf = (__m512)self; __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf); __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); return *reinterpret_cast<__m512*>(&res_asi); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m512d self_asd = (__m512d)self; __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), self_asi); return *reinterpret_cast<__m512d*>(&res_asi); } template ::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) { return self; } XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s) noexcept { return abs(batch(s)); }, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s) noexcept { return abs(batch(s)); }, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_abs_epi32(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_abs_epi64(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return add(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return add(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_add_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch add(batch const& self, batch const& other, 
requires_arch) noexcept { return _mm512_add_ps(self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_add_pd(self, other); } // all template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return self.data == register_type(-1); } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return self.data != register_type(0); } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return self.data; } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { #if defined(_MSC_VER) return _mm512_and_ps(self, other); #else return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); #endif } template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template ::value>> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_si512(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data & other.data); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self))); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return 
_mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self))); } template ::value>> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_si512(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data & ~other.data); } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); #else __m512i tmp = _mm512_slli_epi32(self, other); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_lshift(batch(s), o, avx2 {}); }, self, other); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_slli_epi64(self, other); #endif } else { assert(false && "unsupported arch/op combination"); return {}; } } // bitwise_not template ::value>> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(~self.data); } template XSIMD_INLINE batch bitwise_not(batch const& self, 
requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1))); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1))); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_si512(self, other); } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srai_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srai_epi64(self, other); #endif } else { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, avx2 {}); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); #else 
__m512i tmp = _mm512_srli_epi32(self, other); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srli_epi64(self, other); #endif } else { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, avx2 {}); }, self, other); } } } // rotl template ::value>> XSIMD_INLINE batch rotl(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_rolv_epi32(self, other); } XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_rolv_epi64(self, other); } return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return rotl(batch(s), batch(o), avx2 {}); }, self, other); } template ::value>> XSIMD_INLINE batch rotl(batch const& self, int32_t other, requires_arch) noexcept { return rotl(self, batch(other), A {}); } template ::value>> XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_rol_epi32(self, count); } XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_rol_epi64(self, count); } return detail::fwd_to_avx([](__m256i s) noexcept { return rotl(batch(s), avx2 {}); }, self); } // rotr template ::value>> XSIMD_INLINE batch rotr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) < 4) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return rotr(batch(s), batch(o), avx2 {}); }, self, 
other); } XSIMD_IF_CONSTEXPR(std::is_unsigned::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_rorv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_rorv_epi64(self, other); } } return rotr(self, other, common {}); } template ::value>> XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept { return rotr(self, batch(other), A {}); } template ::value>> XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); XSIMD_IF_CONSTEXPR(sizeof(T) < 4) { return detail::fwd_to_avx([](__m256i s) noexcept { return rotr(batch(s), avx2 {}); }, self); } XSIMD_IF_CONSTEXPR(std::is_unsigned::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_ror_epi32(self, count); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_ror_epi64(self, count); } } return rotr(self, common {}); } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data ^ other.data); } template ::value>> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_si512(self, other); } // bitwise_cast template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castsi512_ps(self); } 
template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castsi512_pd(self); } template >::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castps_pd(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castps_si512(self); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castpd_ps(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castpd_si512(self); } // broadcast template ::value>> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_set1_epi64(val); } else { assert(false && "unsupported"); return {}; } } template XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept { return _mm512_set1_ps(val); } template batch XSIMD_INLINE broadcast(double val, requires_arch) noexcept { return _mm512_set1_pd(val); } // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); } template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); } // compress template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_ps(mask.mask(), self); } template XSIMD_INLINE batch compress(batch 
const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_pd(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi32(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi32(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi64(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi64(mask.mask(), self); } // convert namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepi32_ps(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvttps_epi32(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepu32_ps(self); } template batch fast_cast(batch const& self, batch const&, requires_arch) { return _mm512_cvttps_epu32(self); } } namespace detail { // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); return _mm512_permutex2var_ps(self.real(), idx, self.imag()); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); return _mm512_permutex2var_pd(self.real(), idx, self.imag()); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); 
return _mm512_permutex2var_ps(self.real(), idx, self.imag()); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); return _mm512_permutex2var_pd(self.real(), idx, self.imag()); } } // incr_if template ::value>> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mask_sub_epi32(self, mask.data, self, _mm512_set1_epi32(1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_mask_sub_epi64(self, mask.data, self, _mm512_set1_epi64(1)); } else { return decr_if(self, mask, common {}); } } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_div_ps(self, other); } template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_div_pd(self, other); } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); } template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); } template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(~self.data ^ other.data); } // expand template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_ps(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_pd(mask.mask(), self); } template 
XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi32(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi32(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi64(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi64(mask.mask(), self); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); } template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); } // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fnmadd_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fnmadd_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmadd_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmadd_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmsub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmsub_pd(x, y, z); } // fmas template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmaddsub_ps(x, y, z); } template XSIMD_INLINE batch fmas(batch const& x, batch 
const& y, batch const& z, requires_arch) noexcept { return _mm512_fmaddsub_pd(x, y, z); } // from bool template XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept { return select(self, batch(1), batch(0)); } // from_mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { return static_cast::register_type>(mask); } // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i32gather_epi32(index, static_cast(src), sizeof(T)); } template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i64gather_epi64(index, static_cast(src), sizeof(T)); } template = 0> XSIMD_INLINE batch gather(batch const&, float const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i32gather_ps(index, src, sizeof(float)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i64gather_pd(index, src, sizeof(double)); } // gather: handmade conversions template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); const batch 
high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data)); } // ge template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); } template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); } template ::value>> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // gt template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); } template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); } template ::value>> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_ps(tmp1, tmp2); \ } XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 // The 
following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx1 = _mm512_add_ps(tmp1, tmp2); \ \ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx2 = _mm512_add_ps(tmp3, tmp4); \ \ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \ } XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1)); return concat; } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { #define step1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_pd(tmp1, tmp2); \ } step1(1, row[0], row[2]); step1(2, row[4], row[6]); step1(3, row[1], row[3]); step1(4, row[5], row[7]); #undef step1 auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 
1)); auto resx1 = _mm512_add_pd(tmp5, tmp6); auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); auto resx2 = _mm512_add_pd(tmp7, tmp8); auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); return _mm512_add_pd(tmpx, tmpy); } // incr_if template ::value>> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mask_add_epi32(self, mask.data, self, _mm512_set1_epi32(1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_mask_add_epi64(self, mask.data, self, _mm512_set1_epi64(1)); } else { return incr_if(self, mask, common {}); } } // first (must precede get for two-phase lookup) template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return _mm512_cvtss_f32(self); } template XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return _mm512_cvtsd_f64(self); } template ::value>> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self)) & 0xFFFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return static_cast(_mm_cvtsi128_si32(_mm512_castsi512_si128(self))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { batch low = _mm512_castsi512_si128(self); return first(low, sse4_2 {}); } else { assert(false && "unsupported arch/op combination"); return {}; } } // get: use valignd/valignq to rotate lane I into position 0 in a single op. 
template XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx512f {}); } const auto rotated = _mm512_alignr_epi32(_mm512_castps_si512(self), _mm512_castps_si512(self), I); return _mm_cvtss_f32(_mm512_castps512_ps128(_mm512_castsi512_ps(rotated))); } template XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx512f {}); } const auto rotated = _mm512_alignr_epi64(_mm512_castpd_si512(self), _mm512_castpd_si512(self), I); return _mm_cvtsd_f64(_mm512_castpd512_pd128(_mm512_castsi512_pd(rotated))); } template ::value>> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, avx512f {}); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { const auto rotated = _mm512_alignr_epi32(self, self, I); return first(batch(_mm512_castsi512_si128(rotated)), sse4_2 {}); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { const auto rotated = _mm512_alignr_epi64(self, self, I); return first(batch(_mm512_castsi512_si128(rotated)), sse4_2 {}); } else { // 8/16-bit lanes have no sub-dword rotate in AVX-512F; delegate to AVX halves. constexpr size_t elements_per_lane = batch::size; constexpr size_t lane = I / elements_per_lane; constexpr size_t sub_index = I % elements_per_lane; const auto half = (lane == 0) ? 
detail::lower_half(self) : detail::upper_half(self); return kernel::get(batch(half), ::xsimd::index {}, avx {}); } } // insert template XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept { int32_t tmp = bit_cast(val); return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_castps_si512(self), __mmask16(1 << (I & 15)), tmp)); } template XSIMD_INLINE batch insert(batch const& self, double val, index, requires_arch) noexcept { int64_t tmp = bit_cast(val); return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_castpd_si512(self), __mmask8(1 << (I & 7)), tmp)); } template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mask_set1_epi32(self, __mmask16(1 << (I & 15)), val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_mask_set1_epi64(self, __mmask8(1 << (I & 7)), val); } else { return insert(self, val, pos, common {}); } } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); } // ldexp template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other)); } template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { // FIXME: potential data loss here when converting other elements to // int32 before converting them back to double. 
__m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other)); return _mm512_scalef_pd(self, adjusted_index); } // le template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); } template ::value>> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } namespace detail { // Adapted from https://github.com/serge-sans-paille/fast-bitset-from-bool-array // Generate a bitset from an array of boolean. XSIMD_INLINE unsigned char tobitset(unsigned char unpacked[8]) { uint64_t data; memcpy(&data, unpacked, sizeof(uint64_t)); const uint64_t magic = (0x80 + 0x4000 + 0x200000 + 0x10000000 + 0x0800000000 + 0x040000000000 + 0x02000000000000 + 0x0100000000000000); unsigned char res = ((data * magic) >> 56) & 0xFF; return res; } } // load mask template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { using register_type = typename batch_bool::register_type; constexpr auto size = batch_bool::size; constexpr auto iter = size / 8; static_assert((size % 8) == 0, "incorrect size of bool batch"); register_type mask = 0; for (std::size_t i = 0; i < iter; ++i) { unsigned char block = detail::tobitset((unsigned char*)mem + i * 8); mask |= (register_type(block) << (i * 8)); } return mask; } // load_aligned template ::value>> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return _mm512_load_si512((__m512i const*)mem); } template XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm512_load_ps(mem); } template XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm512_load_pd(mem); } // 
load_complex namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); auto real = _mm512_permutex2var_ps(hi, real_idx, lo); auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); return { real, imag }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); auto real = _mm512_permutex2var_pd(hi, real_idx, lo); auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); return { real, imag }; } } // load_unaligned template ::value>> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm512_loadu_si512((__m512i const*)mem); } template XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return _mm512_loadu_ps(mem); } template XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm512_loadu_pd(mem); } // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept { return _mm512_stream_load_si512((__m512i*)mem); } template XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_stream_load_si512((__m512i*)mem)); } template XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_stream_load_si512((__m512i*)mem)); } // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, 
requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); } template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // mask template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return self.data; } // max template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_max_ps(other, self); } template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_max_pd(other, self); } template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_max_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_max_epi64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return max(batch(s), batch(o)); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_max_epu32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_max_epu64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return max(batch(s), batch(o)); }, self, other); } } } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_min_ps(other, self); } template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_min_pd(other, self); } template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_min_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_min_epi64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return min(batch(s), 
batch(o)); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_min_epu32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_min_epu64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return min(batch(s), batch(o)); }, self, other); } } } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mul_ps(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mul_pd(self, other); } template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mullo_epi32(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return mul(batch(s), batch(o)); }, self, other); } } // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm512_cvtps_epi32(self); } // neg template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ); } template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ); } template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template 
XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data ^ other.data); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm512_rcp14_ps(self); } template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm512_rcp14_pd(self); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept { return _mm512_reduce_add_ps(rhs); } template XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept { return _mm512_reduce_add_pd(rhs); } template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { __m256i low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); } // reduce_max template > XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr batch_constant mask; batch step = _mm512_permutexvar_epi64(mask.as_batch(), self); batch acc = max(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_max(batch(low)); } // reduce_min template > XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr batch_constant mask; batch step = _mm512_permutexvar_epi64(mask.as_batch(), self); batch acc = min(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_min(batch(low)); } // reduce_mul template XSIMD_INLINE float reduce_mul(batch const& rhs, requires_arch) noexcept { return _mm512_reduce_mul_ps(rhs); } template XSIMD_INLINE double reduce_mul(batch const& rhs, requires_arch) noexcept { return _mm512_reduce_mul_pd(rhs); } template ::value>> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_reduce_mul_epi32(self); } else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_reduce_mul_epi64(self); } else { __m256i low = detail::lower_half(self), high = detail::upper_half(self); batch blow(low), bhigh(high); return reduce_mul(blow, avx2 {}) * reduce_mul(bhigh, avx2 {}); } } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm512_rsqrt14_ps(val); } template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm512_rsqrt14_pd(val); } // sadd template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = other < 0; auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(mask, self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } // scatter template ::value || std::is_same::value>> XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i32scatter_epi32(dst, index, src, sizeof(T)); } template ::value || std::is_same::value>> XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i64scatter_epi64(dst, index, src, sizeof(T)); } template XSIMD_INLINE void scatter(batch const& src, float* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i32scatter_ps(dst, index, src, sizeof(float)); } template XSIMD_INLINE void scatter(batch const& src, double* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i64scatter_pd(dst, index, src, sizeof(double)); } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm512_mask_blend_ps(cond, false_br, true_br); } template XSIMD_INLINE batch 
select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm512_mask_blend_pd(cond, false_br, true_br); } template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { alignas(avx2::alignment()) uint8_t buffer[64]; // FIXME: ultra inefficient for (int i = 0; i < 64; ++i) buffer[i] = cond.data & (1ull << i) ? 0xFF : 0; __m256i cond_low = batch::load_aligned(&buffer[0]); __m256i cond_hi = batch::load_aligned(&buffer[32]); __m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br); __m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); return detail::merge_avx(res_low, res_hi); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); __m256i true_low = detail::lower_half(true_br), true_hi = detail::upper_half(true_br); __m256i false_low = detail::lower_half(false_br), false_hi = detail::upper_half(false_br); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); return detail::merge_avx(res_low, res_hi); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mask_blend_epi32(cond, false_br, true_br); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_mask_blend_epi64(cond, false_br, true_br); } else { assert(false && "unsupported arch/type combination"); return {}; } } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch 
const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, avx512f {}); } namespace detail { template using enable_signed_integer_t = std::enable_if_t::value && std::is_signed::value, int>; template using enable_unsigned_integer_t = std::enable_if_t::value && std::is_unsigned::value, int>; } // set template XSIMD_INLINE batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept { return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template XSIMD_INLINE batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept { return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v32hi) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 }; #else return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, 
v18, v17, v16, v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); #endif } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v32hu) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 }; #else return _mm512_set_epi16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); #endif } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v64qi) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 }; #else return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, v15, v14, v13, v12, 
v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); #endif } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v64qu) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 }; #else return _mm512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); #endif } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); using register_type = typename batch_bool::register_type; register_type r = 0; unsigned shift = 0; (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... 
}; return r; } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6); // shuffle within lane if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20) return _mm512_shuffle_ps(x, y, smask); // shuffle within opposite lane if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20) return _mm512_shuffle_ps(y, x, smask); return shuffle(x, y, mask, common {}); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7); // shuffle within lane if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14) return _mm512_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14) return _mm512_shuffle_pd(y, x, smask); return shuffle(x, y, mask, common {}); } // slide_left namespace detail { template struct make_slide_left_pattern { static constexpr size_t get(size_t i, size_t) { return i >= N ? 
i - N : 0; } }; template XSIMD_INLINE batch slide_left_aligned_u32(batch const& x, requires_arch) noexcept { static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits"); if (N == 0) { return x; } if (N >= 64) { return batch(T(0)); } __mmask16 mask = uint16_t(0xFFFFu << (N / 4)); if ((N & 15) == 0) { const uint8_t imm8 = uint8_t(0xe4 << (2 * (N / 16))); return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8); } auto slide_pattern = make_batch_constant, A>(); return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x); } } template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { constexpr size_t NN = N & ~3; if (N == NN || NN >= 64) { // Call fast path return detail::slide_left_aligned_u32(x, A {}); } __m512i xl = detail::slide_left_aligned_u32(_mm512_slli_epi32(x, 8 * (N - NN)), A {}); __m512i xr = detail::slide_left_aligned_u32(_mm512_srli_epi32(x, 32 - 8 * (N - NN)), A {}); return _mm512_or_epi32(xl, xr); } // slide_right namespace detail { template struct make_slide_right_pattern { static constexpr size_t get(size_t i, size_t n) { return i < (n - N) ? 
i + N : 0; } }; template XSIMD_INLINE batch slide_right_aligned_u32(batch const& x, requires_arch) noexcept { static_assert((N & 3) == 0 || N >= 64, "N must be aligned to 32 bits"); if (N == 0) { return x; } if (N >= 64) { return batch(T(0)); } __mmask16 mask = 0xFFFFu >> (N / 4); if ((N & 15) == 0) { const uint8_t imm8 = 0xe4 >> (2 * (N / 16)); return _mm512_maskz_shuffle_i32x4(mask, x, x, imm8); } auto slide_pattern = make_batch_constant, A>(); return _mm512_maskz_permutexvar_epi32(mask, slide_pattern.as_batch(), x); } } template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { constexpr size_t NN = N & ~3; if (N == NN || NN >= 64) { // Call fast path return detail::slide_right_aligned_u32(x, A {}); } __m512i xl = detail::slide_right_aligned_u32(_mm512_slli_epi32(x, 32 - 8 * (N - NN)), A {}); __m512i xr = detail::slide_right_aligned_u32(_mm512_srli_epi32(x, 8 * (N - NN)), A {}); return _mm512_or_epi32(xl, xr); } // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm512_sqrt_ps(val); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm512_sqrt_pd(val); } // ssub template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } // store template XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept { using register_type = typename batch_bool::register_type; constexpr auto size = batch_bool::size; for (std::size_t i = 0; i < size; ++i) mem[i] = self.data & (register_type(1) << i); } // store_aligned template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm512_store_si512((__m512i*)mem, self); } template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return 
_mm512_store_si512((__m512i*)mem, self); } template XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm512_store_ps(mem, self); } template XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm512_store_pd(mem, self); } // store_unaligned template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_si512((__m512i*)mem, self); } template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm512_storeu_si512((__m512i*)mem, self); } template XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_ps(mem, self); } template XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_pd(mem, self); } // store_stream template ::value, void>> XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept { _mm512_stream_si512((__m512i*)mem, self); } template XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept { _mm512_stream_ps(mem, self); } template XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept { _mm512_stream_pd(mem, self); } // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_sub_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); 
return {}; } } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_sub_ps(self, other); } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_sub_pd(self, other); } // swizzle (dynamic version) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_ps(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_pd(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi64(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi32(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; } XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask)) { constexpr int imm0 = detail::mod_shuffle(V0, V1, V2, V3); constexpr int imm1 = detail::mod_shuffle(V4, V5, V6, V7); constexpr int imm2 = detail::mod_shuffle(V8, V9, V10, V11); constexpr int imm3 = detail::mod_shuffle(V12, V13, V14, V15); XSIMD_IF_CONSTEXPR(imm0 == imm1 && imm0 == imm2 && imm0 == imm3) { return _mm512_permute_ps(self, imm0); } } return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(detail::is_identity(mask)) { return self; } 
XSIMD_IF_CONSTEXPR(!detail::is_cross_lane(mask)) { constexpr auto imm = ((V0 & 1) << 0) | ((V1 & 1) << 1) | ((V2 & 1) << 2) | ((V3 & 1) << 3) | ((V4 & 1) << 4) | ((V5 & 1) << 5) | ((V6 & 1) << 6) | ((V7 & 1) << 7); return _mm512_permute_pd(self, imm); } constexpr bool dup_lo = detail::is_dup_lo(mask); constexpr bool dup_hi = detail::is_dup_hi(mask); XSIMD_IF_CONSTEXPR(dup_lo || dup_hi) { const batch half = _mm512_extractf64x4_pd(self, dup_lo ? 0 : 1); constexpr std::conditional_t, batch_constant> half_mask {}; return _mm512_broadcast_f64x4(swizzle(half, half_mask, avx2 {})); } // General case return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } namespace detail { template struct is_pair_of_contiguous_indices; template struct is_pair_of_contiguous_indices : std::true_type { }; template struct is_pair_of_contiguous_indices : std::conditional_t<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type> { }; template struct fold_batch_constant { using type = batch_constant; }; template constexpr bool is_reduce_pattern() { // The actual pattern is {1, 1, 0, 1, 0, 1, ..., 0, 1} if (sizeof...(Is) != batch::size) return false; uint16_t pattern[] = { Is... 
}; if (pattern[0] != 1) return false; for (size_t i = 1; i < sizeof...(Is); i += 1) { if (pattern[i] != (i & 1)) return false; } return true; } } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(detail::is_pair_of_contiguous_indices::value) { constexpr typename detail::fold_batch_constant::type mask32; return _mm512_permutexvar_epi32(static_cast>(mask32), self); } else XSIMD_IF_CONSTEXPR(detail::is_reduce_pattern()) { // FIXME: this sequence is very inefficient, but it's here to catch // a pattern generated by detail::reduce from xsimd_common_math.hpp. // The whole pattern is actually decently folded by GCC and Clang, // so bare with it. constexpr batch_constant mask32; auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); alignas(A::alignment()) uint16_t buffer[32]; _mm512_store_si512((__m512i*)&buffer[0], tmp); buffer[0] = buffer[1]; return _mm512_load_si512(&buffer[0]); } else { return swizzle(self, mask, common {}); } } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[16]; for (int i = 0; i < 16; ++i) tmp_lo0[i] = _mm512_castsi512_si256(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 16, avx2 {}); batch tmp_hi0[16]; for (int i = 0; i < 16; ++i) tmp_hi0[i] = _mm512_castsi512_si256(matrix_begin[16 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 16, avx2 {}); batch tmp_lo1[16]; for (int i = 0; i < 16; ++i) tmp_lo1[i] = _mm512_extracti64x4_epi64(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 16, avx2 {}); batch tmp_hi1[16]; for (int i = 0; i < 16; ++i) tmp_hi1[i] = _mm512_extracti64x4_epi64(matrix_begin[16 + i], 1); transpose(tmp_hi1 + 
0, tmp_hi1 + 16, avx2 {}); for (int i = 0; i < 16; ++i) matrix_begin[i] = detail::merge_avx(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 16; ++i) matrix_begin[i + 16] = detail::merge_avx(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[32]; for (int i = 0; i < 32; ++i) tmp_lo0[i] = _mm512_castsi512_si256(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 32, avx2 {}); batch tmp_hi0[32]; for (int i = 0; i < 32; ++i) tmp_hi0[i] = _mm512_castsi512_si256(matrix_begin[32 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 32, avx2 {}); batch tmp_lo1[32]; for (int i = 0; i < 32; ++i) tmp_lo1[i] = _mm512_extracti64x4_epi64(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 32, avx2 {}); batch tmp_hi1[32]; for (int i = 0; i < 32; ++i) tmp_hi1[i] = _mm512_extracti64x4_epi64(matrix_begin[32 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 32, avx2 {}); for (int i = 0; i < 32; ++i) matrix_begin[i] = detail::merge_avx(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 32; ++i) matrix_begin[i + 32] = detail::merge_avx(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // trunc template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); } // zip_hi template ::value>> XSIMD_INLINE 
batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { lo = _mm512_unpacklo_epi32(self, other); hi = _mm512_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { lo = _mm512_unpacklo_epi64(self, other); hi = _mm512_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), _mm512_extracti32x4_epi32(lo, 3), 2), _mm512_extracti32x4_epi32(hi, 2), 1); } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_unpacklo_ps(self, other); auto hi = _mm512_unpackhi_ps(self, other); return _mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), _mm512_extractf32x4_ps(lo, 3), 2), _mm512_extractf32x4_ps(hi, 2), 1); } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); return _mm512_castps_pd(_mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), _mm512_extractf32x4_ps(lo, 3), 2), _mm512_extractf32x4_ps(hi, 2), 1)); } // zip_lo template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { lo = _mm512_unpacklo_epi32(self, other); hi = 
_mm512_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { lo = _mm512_unpacklo_epi64(self, other); hi = _mm512_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), _mm512_extracti32x4_epi32(hi, 1), 3), _mm512_extracti32x4_epi32(lo, 1), 2); } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_unpacklo_ps(self, other); auto hi = _mm512_unpackhi_ps(self, other); return _mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), _mm512_extractf32x4_ps(hi, 1), 3), _mm512_extractf32x4_ps(lo, 1), 2); } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); return _mm512_castps_pd(_mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), _mm512_extractf32x4_ps(hi, 1), 3), _mm512_extractf32x4_ps(lo, 1), 2)); } // widen template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { __m256i x_lo = detail::lower_half(x); __m256i x_hi = detail::upper_half(x); __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm512_cvtepi32_epi64(x_lo); hi = _mm512_cvtepi32_epi64(x_hi); } else { lo = _mm512_cvtepu32_epi64(x_lo); hi = _mm512_cvtepu32_epi64(x_hi); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm512_cvtepi16_epi32(x_lo); hi = _mm512_cvtepi16_epi32(x_hi); } else { lo = _mm512_cvtepu16_epi32(x_lo); hi = _mm512_cvtepu16_epi32(x_hi); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto pair_lo = widen(batch(x_lo), avx2 {}); auto pair_hi = widen(batch(x_hi), avx2 {}); return { 
detail::merge_avx(pair_lo[0], pair_lo[1]), detail::merge_avx(pair_hi[0], pair_hi[1]) }; } return { lo, hi }; } template XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept { __m512d lo = _mm512_cvtps_pd(detail::lower_half(x)); __m512d hi = _mm512_cvtps_pd(detail::upper_half(x)); return { lo, hi }; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512ifma.hpp000066400000000000000000000016021517435117100252120ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512IFMA_HPP #define XSIMD_AVX512IFMA_HPP #include "../types/xsimd_avx512ifma_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512pf.hpp000066400000000000000000000015741517435117100247130ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512PF_HPP #define XSIMD_AVX512PF_HPP #include "../types/xsimd_avx512pf_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512vbmi.hpp000066400000000000000000000064661517435117100252500ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VBMI_HPP #define XSIMD_AVX512VBMI_HPP #include #include "../types/xsimd_avx512vbmi_register.hpp" namespace xsimd { namespace kernel { using namespace types; // slide_left template > XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency."); __mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63); auto slide_pattern = make_batch_constant, A>(); return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x); } // slide_right template > XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { static_assert((N & 3) != 0 && N < 64, "The AVX512F implementation may have a lower latency."); __mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63); auto slide_pattern = make_batch_constant, A>(); return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x); } // swizzle (dynamic version) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi8(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return 
bitwise_cast(swizzle(bitwise_cast(self), mask, avx512vbmi {})); } // swizzle (static version) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512vbmi {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512vbmi {}); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512vbmi2.hpp000066400000000000000000000126651517435117100253300ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VBMI2_HPP #define XSIMD_AVX512VBMI2_HPP #include #include "../types/xsimd_avx512vbmi2_register.hpp" namespace xsimd { namespace kernel { using namespace types; // compress template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi8(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi8(mask.mask(), self); } // expand template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { 
return _mm512_maskz_expand_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi16((__mmask32)mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi8(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi8(mask.mask(), self); } // rotl template ::value>> XSIMD_INLINE batch rotl(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_shldv_epi16(self, self, _mm512_set1_epi16(static_cast(other))); } else { return rotl(self, other, avx512bw {}); } } template ::value>> XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_shldi_epi16(self, self, count); } else { return rotl(self, avx512bw {}); } } // rotr template ::value>> XSIMD_INLINE batch rotr(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_shrdv_epi16(self, self, _mm512_set1_epi16(static_cast(other))); } else { return rotr(self, other, avx512bw {}); } } template ::value>> XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "count must be less than the number of bits in T"); XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_shrdi_epi16(self, self, count); } else { return rotr(self, avx512bw {}); } } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp000066400000000000000000000016371517435117100267170ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP #define XSIMD_AVX512VNNI_AVX512_BW_HPP #include "../types/xsimd_avx512vnni_avx512bw_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi2.hpp000066400000000000000000000016461517435117100273260ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512VBMI2_HPP #define XSIMD_AVX512VNNI_AVX512VBMI2_HPP #include "../types/xsimd_avx512vnni_avx512vbmi2_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_avxvnni.hpp000066400000000000000000000015711517435117100250250ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVXVNNI_HPP #define XSIMD_AVXVNNI_HPP #include "../types/xsimd_avxvnni_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_common.hpp000066400000000000000000000024011517435117100246150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMMON_HPP #define XSIMD_COMMON_HPP #include "./common/xsimd_common_arithmetic.hpp" #include "./common/xsimd_common_bit.hpp" #include "./common/xsimd_common_cast.hpp" #include "./common/xsimd_common_complex.hpp" #include "./common/xsimd_common_logical.hpp" #include "./common/xsimd_common_math.hpp" #include "./common/xsimd_common_memory.hpp" #include "./common/xsimd_common_rounding.hpp" #include "./common/xsimd_common_swizzle.hpp" #include "./common/xsimd_common_trigo.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_common_fwd.hpp000066400000000000000000000204721517435117100254650ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Marco Barbone * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_COMMON_FWD_HPP #define XSIMD_COMMON_FWD_HPP #include #include namespace xsimd { // Minimal forward declarations used in this header template class batch; template class batch_bool; template struct batch_constant; template struct batch_bool_constant; template struct convert; template struct requires_arch; struct aligned_mode; struct unaligned_mode; namespace types { template struct has_simd_register; } namespace kernel { // forward declaration template ::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, requires_arch) noexcept; template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept; template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept; template ::value>> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept; template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept; template XSIMD_INLINE batch rotl(batch const& self, requires_arch) noexcept; template XSIMD_INLINE batch rotr(batch const& self, STy other, requires_arch) noexcept; template XSIMD_INLINE batch rotr(batch const& self, requires_arch) noexcept; template 
XSIMD_INLINE batch load(T const* mem, aligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load(T const* mem, unaligned_mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(T_in const* mem, batch_bool_constant mask, convert, alignment, requires_arch) noexcept; template XSIMD_INLINE void store_masked(T_out* mem, batch const& src, batch_bool_constant mask, alignment, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(int32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE batch load_masked(uint32_t const* mem, batch_bool_constant mask, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE std::enable_if_t::value, batch> load_masked(int64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE std::enable_if_t::value, batch> load_masked(uint64_t const*, batch_bool_constant, convert, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(int32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; template XSIMD_INLINE void store_masked(uint32_t* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept; template XSIMD_INLINE std::enable_if_t::value> store_masked(int64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; template XSIMD_INLINE std::enable_if_t::value> store_masked(uint64_t*, batch const&, batch_bool_constant, Mode, requires_arch) noexcept; // Forward declarations for pack-level helpers namespace detail { template XSIMD_INLINE void reassociation_barrier(T& x, const char*) noexcept; template XSIMD_INLINE void reassociation_barrier(batch& b, const char* reason) noexcept; template XSIMD_INLINE constexpr bool is_identity() noexcept; template XSIMD_INLINE constexpr bool is_identity(batch_constant) noexcept; template XSIMD_INLINE constexpr bool is_dup_lo(batch_constant) noexcept; template XSIMD_INLINE constexpr bool 
is_dup_hi(batch_constant) noexcept; template XSIMD_INLINE constexpr bool is_cross_lane(batch_constant) noexcept; template XSIMD_INLINE constexpr bool is_only_from_lo(batch_constant) noexcept; template XSIMD_INLINE constexpr bool is_only_from_hi(batch_constant) noexcept; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_constants.hpp000066400000000000000000000334511517435117100253520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NUMERICAL_CONSTANT_HPP #define XSIMD_NUMERICAL_CONSTANT_HPP #include #include "../types/xsimd_utils.hpp" namespace xsimd { namespace constants { #define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ template \ XSIMD_INLINE T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ XSIMD_INLINE float NAME() noexcept \ { \ return SINGLE; \ } \ template <> \ XSIMD_INLINE double NAME() noexcept \ { \ return DOUBLE; \ } #define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ template \ XSIMD_INLINE T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ XSIMD_INLINE float NAME() noexcept \ { \ return bit_cast((uint32_t)SINGLE); \ } \ template <> \ XSIMD_INLINE double NAME() noexcept \ { \ return bit_cast((uint64_t)DOUBLE); \ } // Under fast-math, GCC might replace signmask (minus zero) by zero #if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) #pragma GCC push_options #pragma GCC optimize("signed-zeros") #endif #ifndef __FAST_MATH__ XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) 
XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) #endif XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd) XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) #if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX template constexpr T allbits() noexcept; template constexpr as_integer_t mask1frexp() noexcept; template constexpr as_integer_t 
mask2frexp() noexcept; template constexpr as_integer_t maxexponent() noexcept; template constexpr as_integer_t maxexponentm1() noexcept; template constexpr int32_t nmb() noexcept; template constexpr T zero() noexcept; template constexpr T minvalue() noexcept; template constexpr T maxvalue() noexcept; /************************** * allbits implementation * **************************/ namespace detail { template ::value> struct allbits_impl { static constexpr T get_value() noexcept { return T(~0); } }; template struct allbits_impl { static constexpr T get_value() noexcept { return nan(); } }; } template XSIMD_INLINE constexpr T allbits() noexcept { return T(detail::allbits_impl::get_value()); } /***************************** * mask1frexp implementation * *****************************/ template XSIMD_INLINE constexpr as_integer_t mask1frexp() noexcept { return as_integer_t(mask1frexp()); } template <> XSIMD_INLINE constexpr int32_t mask1frexp() noexcept { return 0x7f800000; } template <> XSIMD_INLINE constexpr int64_t mask1frexp() noexcept { return 0x7ff0000000000000; } /***************************** * mask2frexp implementation * *****************************/ template XSIMD_INLINE constexpr as_integer_t mask2frexp() noexcept { return as_integer_t(mask2frexp()); } template <> XSIMD_INLINE constexpr int32_t mask2frexp() noexcept { return 0x3f000000; } template <> XSIMD_INLINE constexpr int64_t mask2frexp() noexcept { return 0x3fe0000000000000; } /****************************** * maxexponent implementation * ******************************/ template XSIMD_INLINE constexpr as_integer_t maxexponent() noexcept { return as_integer_t(maxexponent()); } template <> XSIMD_INLINE constexpr int32_t maxexponent() noexcept { return 127; } template <> XSIMD_INLINE constexpr int64_t maxexponent() noexcept { return 1023; } /****************************** * maxexponent implementation * ******************************/ template XSIMD_INLINE constexpr as_integer_t maxexponentm1() noexcept { 
return as_integer_t(maxexponentm1()); } template <> XSIMD_INLINE constexpr int32_t maxexponentm1() noexcept { return 126; } template <> XSIMD_INLINE constexpr int64_t maxexponentm1() noexcept { return 1022; } /********************** * nmb implementation * **********************/ template XSIMD_INLINE constexpr int32_t nmb() noexcept { return nmb(); } template <> XSIMD_INLINE constexpr int32_t nmb() noexcept { return 23; } template <> XSIMD_INLINE constexpr int32_t nmb() noexcept { return 52; } /*********************** * zero implementation * ***********************/ template XSIMD_INLINE constexpr T zero() noexcept { return T(typename T::value_type(0)); } /*************************** * minvalue implementation * ***************************/ namespace detail { template struct minvalue_impl { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template struct minvalue_common { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl { XSIMD_INLINE static float get_value() noexcept { return bit_cast((uint32_t)0xff7fffff); } }; template <> struct minvalue_impl { XSIMD_INLINE static double get_value() noexcept { return bit_cast((uint64_t)0xffefffffffffffff); } }; } template constexpr T minvalue() noexcept { return T(detail::minvalue_impl::get_value()); } /*************************** * maxvalue implementation * ***************************/ template constexpr T maxvalue() noexcept { return T(std::numeric_limits::max()); } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_emulated.hpp000066400000000000000000001076451517435117100251450ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_EMULATED_HPP #define XSIMD_EMULATED_HPP #include #include #include #include "../arch/xsimd_scalar.hpp" #include "../types/xsimd_emulated_register.hpp" #include "../types/xsimd_utils.hpp" namespace xsimd { template struct batch_bool_constant; template XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; template struct batch_constant; namespace kernel { using namespace types; // fwd template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; namespace detail { template auto emulated_apply(F func, Bs const&... bs) { return func(bs.data[I]...); } template auto emulated_apply(F func, std::index_sequence, B const& b, Bs const&... bs) -> std::array { return { emulated_apply(func, b, bs...)... }; } template auto emulated_apply(F func, B const& b, Bs const&... 
bs) { return emulated_apply(func, std::make_index_sequence(), b, bs...); } } // abs template ::size> XSIMD_INLINE batch abs(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::abs(v); }, self); } // add template ::size> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::add(v0, v1); }, self, other); } // all template ::size> XSIMD_INLINE bool all(batch_bool const& self, requires_arch>) noexcept { return std::all_of(self.data.begin(), self.data.end(), [](T v) { return bool(v); }); } // any template ::size> XSIMD_INLINE bool any(batch_bool const& self, requires_arch>) noexcept { return std::any_of(self.data.begin(), self.data.end(), [](T v) { return bool(v); }); } // batch_bool_cast template ::size> XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch>) noexcept { return { self.data }; } // bitwise_and template ::size> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_and(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_and(v0, v1); }, self, other); } // bitwise_andnot template ::size> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_andnot(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_andnot(v0, v1); }, self, other); } // bitwise_lshift template ::size> XSIMD_INLINE batch 
bitwise_lshift(batch const& self, int32_t other, requires_arch>) noexcept { return detail::emulated_apply([other](T v) { return xsimd::bitwise_lshift(v, other); }, self); } // bitwise_not template ::size> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::bitwise_not(v); }, self); } template ::size> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch>) noexcept { return detail::emulated_apply([](bool v) { return xsimd::bitwise_not(v); }, self); } // bitwise_or template ::size> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_or(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_or(v0, v1); }, self, other); } // bitwise_rshift template ::size> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch>) noexcept { return detail::emulated_apply([other](T v) { return xsimd::bitwise_rshift(v, other); }, self); } // bitwise_xor template ::size> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_xor(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_xor(v0, v1); }, self, other); } // bitwise_cast template ::size> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; char* raw_data = reinterpret_cast(result.data()); const char* raw_input = reinterpret_cast(self.data.data()); 
memcpy(raw_data, raw_input, size * sizeof(T_out)); return result; } // broadcast template ::size> batch XSIMD_INLINE broadcast(T val, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array r; std::fill(r.begin(), r.end(), val); return r; } // first template ::size> T XSIMD_INLINE first(batch const& self, requires_arch>) noexcept { return self.data[0]; } #if 0 // count template ::size> XSIMD_INLINE size_t count(batch_bool const& x, requires_arch>) noexcept { uint64_t m = x.mask(); // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count } #endif // store_complex namespace detail { // complex_low template ::size> XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; for (size_t i = 0; i < size / 2; ++i) { result[2 * i] = self.real().data[i]; result[1 + 2 * i] = self.imag().data[i]; } return result; } // complex_high template ::size> XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; for (size_t i = 0; i < size / 2; ++i) { result[2 * i] = self.real().data[i + size / 2]; result[1 + 2 * i] = self.imag().data[i + size / 2]; } return result; } } // decr_if template ::size> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch>) noexcept { return self - batch(mask.data); } // div template ::size> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::div(v0, v1); }, self, other); } // fast_cast namespace detail { template ::size> 
XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](int32_t v) { return float(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](uint32_t v) { return float(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](int64_t v) { return double(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](uint64_t v) { return double(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](float v) { return int32_t(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](double v) { return int64_t(v); }, self); } } // eq template ::size> XSIMD_INLINE batch_bool> eq(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::eq(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool> eq(batch_bool> const& self, batch_bool> const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::eq(v0, v1); }, self, other); } // from_bool template ::size> XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch>) noexcept { return detail::emulated_apply([](bool v) { return T(v); }, self); } // from_mask template ::size> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array vmask; for (size_t i = 0; i < size; ++i) vmask[i] = (mask >> i) & 1u; return vmask; } // ge template ::size> XSIMD_INLINE batch_bool> 
ge(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::ge(v0, v1); }, self, other); } // gt template ::size> XSIMD_INLINE batch_bool> gt(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::gt(v0, v1); }, self, other); } // haddp template ::size> XSIMD_INLINE batch haddp(batch const* row, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array r; for (size_t i = 0; i < size; ++i) r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front()); return r; } // incr_if template ::size> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch>) noexcept { return self + batch(mask.data); } // insert template ::size> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch>) noexcept { batch other = self; other.data[I] = val; return other; } // isnan template ::size, class = std::enable_if_t::value>> XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::isnan(v); }, self); } // load_aligned template ::size> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array res; std::copy(mem, mem + size, res.begin()); return res; } // load_unaligned template ::size> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array res; std::copy(mem, mem + size, res.begin()); return res; } // load_complex namespace detail { template ::size> XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array real, imag; for (size_t i = 0; i < size / 2; ++i) { real[i] = hi.data[2 * i]; imag[i] = hi.data[1 + 2 * i]; } for (size_t i = 0; i < size / 2; ++i) { 
real[size / 2 + i] = lo.data[2 * i]; imag[size / 2 + i] = lo.data[1 + 2 * i]; } return { real, imag }; } } // le template ::size> XSIMD_INLINE batch_bool> le(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::le(v0, v1); }, self, other); } // lt template ::size> XSIMD_INLINE batch_bool> lt(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::lt(v0, v1); }, self, other); } // mask template ::size> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; uint64_t res = 0; for (size_t i = 0; i < size; ++i) res |= (uint64_t)(self.data[i] ? 1u : 0u) << i; return res; } // max template ::size> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::max(v0, v1); }, self, other); } // min template ::size> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::min(v0, v1); }, self, other); } // mul template ::size> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::mul(v0, v1); }, self, other); } // nearbyint_as_int template ::size> XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::nearbyint_as_int(v); }, self); } // neg template ::size> XSIMD_INLINE batch neg(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::neg(v); }, self); } // neq template ::size> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::neq(v0, v1); }, self, other); } template 
::size> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::neq(v0, v1); }, self, other); } // reduce_add template ::size> XSIMD_INLINE T reduce_add(batch const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array buffer; self.store_unaligned(buffer.data()); return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin()); } // reduce_max template ::size> XSIMD_INLINE T reduce_max(batch const& self, requires_arch>) noexcept { return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) { return xsimd::max(x, y); }); } // reduce_min template ::size> XSIMD_INLINE T reduce_min(batch const& self, requires_arch>) noexcept { return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) { return xsimd::min(x, y); }); } // reduce_mul template ::size> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array buffer; self.store_unaligned(buffer.data()); return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin(), std::multiplies()); } // rsqrt template ::size> XSIMD_INLINE batch rsqrt(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::rsqrt(v); }, self); } // select template ::size> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch>) noexcept { return detail::emulated_apply([](bool c, T t, T f) { return xsimd::select(c, t, f); }, cond, true_br, false_br); } template XSIMD_INLINE batch select(batch_bool_constant const& cond, batch const& true_br, batch const& false_br, requires_arch::size>>) noexcept { constexpr size_t size = batch::size; static_assert(sizeof...(Values) == size, "consistent init"); return select((batch_bool)cond, true_br, 
false_br, emulated<8 * sizeof(T) * size> {}); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch::size>>) noexcept { constexpr size_t size = batch::size; batch bmask = mask; std::array res; for (size_t i = 0; i < size; ++i) res[i] = bmask.data[i] < size ? x.data[bmask.data[i]] : y.data[bmask.data[i] - size]; return res; } // sqrt template ::size> XSIMD_INLINE batch sqrt(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::sqrt(v); }, self); } // slide_left template ::size> XSIMD_INLINE batch slide_left(batch const& x, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; char* raw_data = reinterpret_cast(result.data()); memset(raw_data, 0, M); memcpy(raw_data + M, reinterpret_cast(x.data.data()), sizeof(T) * result.size() - M); return result; } // slide_right template ::size> XSIMD_INLINE batch slide_right(batch const& x, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; char* raw_data = reinterpret_cast(result.data()); memcpy(raw_data, reinterpret_cast(x.data.data()) + M, sizeof(T) * result.size() - M); memset(raw_data + sizeof(T) * result.size() - M, 0, M); return result; } // sadd template ::size> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::sadd(v0, v1); }, self, other); } // set template XSIMD_INLINE batch> set(batch> const&, requires_arch>, Values... values) noexcept { static_assert(sizeof...(Values) == batch>::size, "consistent init"); return { typename batch>::register_type { static_cast(values)... } }; } template XSIMD_INLINE batch_bool> set(batch_bool> const&, requires_arch>, Values... values) noexcept { static_assert(sizeof...(Values) == batch>::size, "consistent init"); return { std::array { static_cast(values)... 
} }; } // ssub template ::size> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::ssub(v0, v1); }, self, other); } // store_aligned template XSIMD_INLINE void store_aligned(T* mem, batch> const& self, requires_arch>) noexcept { std::copy(self.data.begin(), self.data.end(), mem); } // store_unaligned template XSIMD_INLINE void store_unaligned(T* mem, batch> const& self, requires_arch>) noexcept { std::copy(self.data.begin(), self.data.end(), mem); } // sub template ::size> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::sub(v0, v1); }, self, other); } // swizzle template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch::size>>) noexcept { constexpr size_t size = batch::size; batch bmask = mask; std::array res; for (size_t i = 0; i < size; ++i) res[i] = self.data[bmask.data[i]]; return res; } // zip_hi template ::size> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch>) noexcept { constexpr size_t size = batch::size; // Note: irregular behavior for odd numbers. std::array res; if (size % 2) { for (size_t i = 0; i < size; ++i) res[i] = (i % 2 ? self : other).data[size / 2 + i / 2]; } else { for (size_t i = 0; i < size; ++i) res[i] = (i % 2 ? other : self).data[size / 2 + i / 2]; } return res; } // zip_lo template ::size> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch>) noexcept { constexpr size_t size = batch::size; // Note: irregular behavior for odd numbers. std::array res; for (size_t i = 0; i < size; ++i) res[i] = (i % 2 ? 
other : self).data[i / 2]; return res; } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_fma3_avx.hpp000066400000000000000000000067541517435117100250500ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX_HPP #define XSIMD_FMA3_AVX_HPP #include "../types/xsimd_fma3_avx_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmadd_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmadd_pd(x, y, z); } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmsub_ps(x, y, z); } template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmsub_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmadd_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmadd_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmsub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmsub_pd(x, y, z); } // fmas template 
XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmaddsub_ps(x, y, z); } template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmaddsub_pd(x, y, z); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_fma3_avx2.hpp000066400000000000000000000030701517435117100251160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX2_HPP #define XSIMD_FMA3_AVX2_HPP #include "../types/xsimd_fma3_avx2_register.hpp" // Allow inclusion of xsimd_fma3_avx.hpp #ifdef XSIMD_FMA3_AVX_HPP #undef XSIMD_FMA3_AVX_HPP #define XSIMD_FORCE_FMA3_AVX_HPP #endif // Disallow inclusion of ./xsimd_fma3_avx_register.hpp #ifndef XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #endif // Include ./xsimd_fma3_avx.hpp but s/avx/avx2 #define avx avx2 #include "./xsimd_fma3_avx.hpp" #undef avx #undef XSIMD_FMA3_AVX_HPP // Carefully restore guards #ifdef XSIMD_FORCE_FMA3_AVX_HPP #define XSIMD_FMA3_AVX_HPP #undef XSIMD_FORCE_FMA3_AVX_HPP #endif #ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #undef XSIMD_FMA3_AVX_REGISTER_HPP #undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #endif #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_fma3_sse.hpp000066400000000000000000000073501517435117100250350ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * 
* Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_SSE_HPP #define XSIMD_FMA3_SSE_HPP #include "../types/xsimd_fma3_sse_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmadd_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmadd_pd(x, y, z); } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmsub_ps(x, y, z); } template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmsub_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmadd_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmadd_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmsub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmsub_pd(x, y, z); } // fms template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmaddsub_ps(x, y, z); } template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmaddsub_pd(x, y, z); } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_fma4.hpp000066400000000000000000000066051517435117100241660ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA4_HPP #define XSIMD_FMA4_HPP #include "../types/xsimd_fma4_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmacc_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmacc_pd(x, y, z); } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmsub_ps(x, y, z); } template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmsub_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_macc_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_macc_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_msub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_msub_pd(x, y, z); } // fmas template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return 
_mm_maddsub_ps(x, y, z); } template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_maddsub_pd(x, y, z); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_i8mm_neon64.hpp000066400000000000000000000016051517435117100253750ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_I8MM_NEON64_HPP #define XSIMD_I8MM_NEON64_HPP #include "../types/xsimd_i8mm_neon64_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_isa.hpp000066400000000000000000000055741517435117100241170ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ISA_HPP #define XSIMD_ISA_HPP #include "../config/xsimd_arch.hpp" #include "./xsimd_common_fwd.hpp" #if XSIMD_WITH_EMULATED #include "./xsimd_emulated.hpp" #endif #if XSIMD_WITH_SSE2 #include "./xsimd_sse2.hpp" #endif #if XSIMD_WITH_SSE3 #include "./xsimd_sse3.hpp" #endif #if XSIMD_WITH_SSSE3 #include "./xsimd_ssse3.hpp" #endif #if XSIMD_WITH_SSE4_1 #include "./xsimd_sse4_1.hpp" #endif #if XSIMD_WITH_SSE4_2 #include "./xsimd_sse4_2.hpp" #endif #if XSIMD_WITH_FMA3_SSE #include "./xsimd_fma3_sse.hpp" #endif #if XSIMD_WITH_FMA4 #include "./xsimd_fma4.hpp" #endif #if XSIMD_WITH_AVX #include "./xsimd_avx.hpp" #endif #if XSIMD_WITH_FMA3_AVX #include "./xsimd_fma3_avx.hpp" #endif #if XSIMD_WITH_AVXVNNI #include "./xsimd_avxvnni.hpp" #endif #if XSIMD_WITH_AVX2 #include "./xsimd_avx2.hpp" #endif #if XSIMD_WITH_FMA3_AVX2 #include "./xsimd_fma3_avx2.hpp" #endif #if XSIMD_WITH_AVX512F #include "./xsimd_avx512f.hpp" #endif #if XSIMD_WITH_AVX512DQ #include "./xsimd_avx512dq.hpp" #endif #if XSIMD_WITH_AVX512BW #include "./xsimd_avx512bw.hpp" #endif #if XSIMD_WITH_AVX512ER #include "./xsimd_avx512er.hpp" #endif #if XSIMD_WITH_AVX512PF #include "./xsimd_avx512pf.hpp" #endif #if XSIMD_WITH_AVX512IFMA #include "./xsimd_avx512ifma.hpp" #endif #if XSIMD_WITH_AVX512VBMI #include "./xsimd_avx512vbmi.hpp" #endif #if XSIMD_WITH_AVX512VBMI2 #include "./xsimd_avx512vbmi2.hpp" #endif #if XSIMD_WITH_AVX512VNNI_AVX512BW #include "./xsimd_avx512vnni_avx512bw.hpp" #endif #if XSIMD_WITH_AVX512VNNI_AVX512VBMI2 #include "./xsimd_avx512vnni_avx512vbmi2.hpp" #endif #if XSIMD_WITH_NEON #include "./xsimd_neon.hpp" #endif #if XSIMD_WITH_NEON64 #include "./xsimd_neon64.hpp" #endif #if XSIMD_WITH_I8MM_NEON64 #include "./xsimd_i8mm_neon64.hpp" #endif #if XSIMD_WITH_SVE #include "./xsimd_sve.hpp" #endif #if XSIMD_WITH_RVV #include "./xsimd_rvv.hpp" #endif #if XSIMD_WITH_WASM #include "./xsimd_wasm.hpp" #endif #if 
XSIMD_WITH_VSX #include "./xsimd_vsx.hpp" #endif #if XSIMD_WITH_VXE #include "./xsimd_vxe.hpp" #endif // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_neon.hpp000066400000000000000000005567001517435117100243040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NEON_HPP #define XSIMD_NEON_HPP #include #include #include #include #include #include "../types/xsimd_batch_fwd.hpp" #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" #include "../utils/xsimd_type_traits.hpp" #include "./common/xsimd_common_bit.hpp" #include "./common/xsimd_common_cast.hpp" #include "./xsimd_common_fwd.hpp" namespace xsimd { namespace kernel { using namespace types; namespace detail { /************************************** * enabling / disabling metafunctions * **************************************/ template using enable_neon_type_t = std::enable_if_t::value || std::is_same::value, int>; template using exclude_int64_neon_t = std::enable_if_t<(std::is_integral::value && sizeof(T) != 8) || std::is_same::value, int>; } /**************** * bitwise_cast * ****************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(uint8x16_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_u8_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_u8_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_u8_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_u8_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_u8_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_u8_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_u8_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_u8_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_s8_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(int8x16_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_s8_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_s8_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_s8_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_s8_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_s8_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_s8_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_s8_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_u16_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_u16_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(uint16x8_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_u16_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_u16_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_u16_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_u16_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_u16_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_u16_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_s16_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_s16_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_s16_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(int16x8_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_s16_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_s16_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_s16_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_s16_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_s16_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_u32_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_u32_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_u32_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_u32_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(uint32x4_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_u32_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_u32_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_u32_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_u32_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_s32_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_s32_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_s32_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_s32_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_s32_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(int32x4_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_s32_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_s32_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_s32_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_u64_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_u64_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_u64_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_u64_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_u64_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_u64_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(uint64x2_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_u64_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_u64_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_s64_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_s64_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_s64_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_s64_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_s64_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_s64_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_s64_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(int64x2_t a) noexcept { return a; } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_s64_f32(a); } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_f32_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_f32_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_f32_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_f32_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_f32_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_f32_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_f32_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_f32_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(float32x4_t a) noexcept { return a; } } template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { using src_register_type = typename batch::register_type; return wrap::x_vreinterpretq, map_to_sized_type_t>(src_register_type(arg)); } /************* * broadcast * *************/ template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u8(uint8_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s8(int8_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u16(uint16_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T 
val, requires_arch) noexcept { return vdupq_n_s16(int16_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u32(uint32_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s32(int32_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_u64(uint64_t(val)); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vdupq_n_s64(int64_t(val)); } template XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept { return vdupq_n_f32(val); } /************* * from_bool * *************/ template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u8(arg, vdupq_n_u8(1)); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_s8_u8(vandq_u8(arg.data, vdupq_n_u8(1))); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u16(arg, vdupq_n_u16(1)); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_s16_u16(vandq_u16(arg.data, vdupq_n_u16(1))); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u32(arg, vdupq_n_u32(1)); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_s32_u32(vandq_u32(arg.data, vdupq_n_u32(1))); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vandq_u64(arg, vdupq_n_u64(1)); } template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_s64_u64(vandq_u64(arg.data, vdupq_n_u64(1))); } template XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_f32_u32(vandq_u32(arg, 
vreinterpretq_u32_f32(vdupq_n_f32(1.f)))); } /******** * load * ********/ // It is not possible to use a call to A::alignment() here, so use an // immediate instead. #if defined(__clang__) || defined(__GNUC__) #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16)) #else #define xsimd_aligned_load(inst, type, expr) inst((type)expr) #endif template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_u8, uint8_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_s8, int8_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_u16, uint16_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_s16, int16_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_u32, uint32_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_s32, int32_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_u64, uint64_t*, src); } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_s64, int64_t*, src); } template XSIMD_INLINE batch load_aligned(float const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_f32, float*, src); } #undef xsimd_aligned_load template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_u8((uint8_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return 
vld1q_s8((int8_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_u16((uint16_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_s16((int16_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_u32((uint32_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_s32((int32_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_u64((uint64_t*)src); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return vld1q_s64((int64_t*)src); } template XSIMD_INLINE batch load_unaligned(float const* src, convert, requires_arch) noexcept { return vld1q_f32(src); } /* batch bool version */ template = 0> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { auto vmem = load_unaligned((unsigned char const*)mem, convert {}, A {}); auto const zero = batch { 0 }; return { (zero - vmem).data }; } template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept { return load_unaligned(mem, t, r); } template = 0> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { auto const vmem = batch(vmovl_u8(vld1_u8((unsigned char const*)mem))); auto const zero = batch { 0 }; return { (zero - vmem).data }; } template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept { return load_unaligned(mem, t, r); } template = 0> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { uint8x8_t tmp = vreinterpret_u8_u32(vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0)); auto const vmem = 
batch(vmovl_u16(vget_low_u16(vmovl_u8(tmp)))); auto const zero = batch { 0 }; return { (zero - vmem).data }; } template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept { return load_unaligned(mem, t, r); } /* masked version */ namespace detail { template struct load_masked; template <> struct load_masked<> { template static XSIMD_INLINE batch apply(T const* /* mem */, batch acc, std::integral_constant) noexcept { return acc; } }; template struct load_masked { template static XSIMD_INLINE batch apply(T const* mem, batch acc, std::true_type) noexcept { return load_masked::template apply(mem, insert(acc, mem[I], index {}), std::integral_constant {}); } template static XSIMD_INLINE batch apply(T const* mem, batch acc, std::false_type) noexcept { return load_masked::template apply(mem, acc, std::integral_constant {}); } }; } template XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant /* mask */, Mode, requires_arch) noexcept { // Call insert whenever Values... 
are true return detail::load_masked::template apply<0>(mem, broadcast(T(0), A {}), std::integral_constant {}); } /********* * store * *********/ template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u8((uint8_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s8((int8_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u16((uint16_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s16((int16_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u32((uint32_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s32((int32_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_u64((uint64_t*)dst, src); } template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vst1q_s64((int64_t*)dst, src); } template XSIMD_INLINE void store_aligned(float* dst, batch const& src, requires_arch) noexcept { vst1q_f32(dst, src); } template XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept { store_aligned(dst, src, A {}); } /**************** * load_complex * ****************/ template XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; const float* buf = reinterpret_cast(mem); float32x4x2_t tmp = vld2q_f32(buf); real_batch real = tmp.val[0], imag = tmp.val[1]; return batch, A> { real, imag }; } template XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) noexcept { return load_complex_aligned(mem, cvt, A {}); } /***************** * store_complex * 
*****************/ template XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { float32x4x2_t tmp; tmp.val[0] = src.real(); tmp.val[1] = src.imag(); float* buf = reinterpret_cast(dst); vst2q_f32(buf, tmp); } template XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { store_complex_aligned(dst, src, A {}); } /********************* * store * *********************/ template = 0> XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { uint8x16_t val = vshrq_n_u8(b.data, 7); alignas(A::alignment()) uint8_t buffer[batch_bool::size]; vst1q_u8(buffer, val); memcpy(mem, buffer, sizeof(buffer)); } template = 0> XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { uint8x8_t val = vshr_n_u8(vqmovn_u16(b.data), 7); alignas(A::alignment()) uint8_t buffer[batch_bool::size]; vst1_u8(buffer, val); memcpy(mem, buffer, sizeof(buffer)); } template = 0> XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(b.data), vdup_n_u16(0))), 7); alignas(A::alignment()) uint8_t buffer[8]; vst1_u8(buffer, val); memcpy(mem, buffer, batch_bool::size); } template = 0> XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { uint8x8_t val = vshr_n_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(vcombine_u32(vqmovn_u64(b.data), vdup_n_u32(0))), vdup_n_u16(0))), 7); alignas(A::alignment()) uint8_t buffer[8]; vst1_u8(buffer, val); memcpy(mem, buffer, batch_bool::size); } template XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { store(batch_bool(b.data), mem, A {}); } /******* * set * *******/ template = 0> XSIMD_INLINE batch set(batch const&, requires_arch req, Args... args) noexcept { alignas(A::alignment()) T data[] = { static_cast(args)... 
}; return load_aligned(data, {}, req); } template = 0> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { using unsigned_type = as_unsigned_integer_t; auto const out = batch { static_cast(args ? -1LL : 0LL)... }; return { out.data }; } template XSIMD_INLINE batch set(batch const&, requires_arch req, float f0, float f1, float f2, float f3) noexcept { alignas(A::alignment()) float data[] = { f0, f1, f2, f3 }; return load_aligned(data, {}, req); } template XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, std::complex c0, std::complex c1, std::complex c2, std::complex c3) noexcept { return batch, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() }, float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() }); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { using unsigned_type = as_unsigned_integer_t; auto const out = batch { static_cast(args ? -1LL : 0LL)... }; return { out.data }; } /******* * neg * *******/ template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs))); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s8(rhs); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs))); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s16(rhs); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs))); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s32(rhs); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return 0 - rhs; } template XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_f32(rhs); } 
/******* * add * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vaddq(uint8x16_t a, uint8x16_t b) noexcept { return vaddq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vaddq(int8x16_t a, int8x16_t b) noexcept { return vaddq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vaddq(uint16x8_t a, uint16x8_t b) noexcept { return vaddq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vaddq(int16x8_t a, int16x8_t b) noexcept { return vaddq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vaddq(uint32x4_t a, uint32x4_t b) noexcept { return vaddq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vaddq(int32x4_t a, int32x4_t b) noexcept { return vaddq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vaddq(uint64x2_t a, uint64x2_t b) noexcept { return vaddq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vaddq(int64x2_t a, int64x2_t b) noexcept { return vaddq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vaddq(float32x4_t a, float32x4_t b) noexcept { return vaddq_f32(a, b); } } template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vaddq>(register_type(lhs), register_type(rhs)); } /******* * avg * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vhaddq(uint8x16_t a, uint8x16_t b) noexcept { return vhaddq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vhaddq(uint16x8_t a, uint16x8_t b) noexcept { return vhaddq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vhaddq(uint32x4_t a, uint32x4_t b) noexcept { return vhaddq_u32(a, b); } } template ::value && sizeof(T) != 8)>> XSIMD_INLINE batch avg(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vhaddq>(register_type(lhs), register_type(rhs)); } /******** * avgr * ********/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vrhaddq(uint8x16_t a, uint8x16_t b) noexcept { return vrhaddq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vrhaddq(uint16x8_t a, uint16x8_t b) noexcept { return vrhaddq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vrhaddq(uint32x4_t a, uint32x4_t b) noexcept { return vrhaddq_u32(a, b); } } template ::value && sizeof(T) != 8)>> XSIMD_INLINE batch avgr(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vrhaddq>(register_type(lhs), register_type(rhs)); } /******** * sadd * ********/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vqaddq(uint8x16_t a, uint8x16_t b) noexcept { return vqaddq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vqaddq(int8x16_t a, int8x16_t b) noexcept { return vqaddq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vqaddq(uint16x8_t a, uint16x8_t b) noexcept { return vqaddq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vqaddq(int16x8_t a, int16x8_t b) noexcept { return vqaddq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vqaddq(uint32x4_t a, uint32x4_t b) noexcept { return vqaddq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vqaddq(int32x4_t a, int32x4_t b) noexcept { return vqaddq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vqaddq(uint64x2_t a, uint64x2_t b) noexcept { return vqaddq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vqaddq(int64x2_t a, int64x2_t b) noexcept { return vqaddq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vqaddq(float32x4_t a, float32x4_t b) noexcept { return vaddq_f32(a, b); } } template = 0> XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vqaddq>(register_type(lhs), register_type(rhs)); } /******* * sub * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vsubq(uint8x16_t a, uint8x16_t b) noexcept { return vsubq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vsubq(int8x16_t a, int8x16_t b) noexcept { return vsubq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vsubq(uint16x8_t a, uint16x8_t b) noexcept { return vsubq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vsubq(int16x8_t a, int16x8_t b) noexcept { return vsubq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vsubq(uint32x4_t a, uint32x4_t b) noexcept { return vsubq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vsubq(int32x4_t a, int32x4_t b) noexcept { return vsubq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vsubq(uint64x2_t a, uint64x2_t b) noexcept { return vsubq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vsubq(int64x2_t a, int64x2_t b) noexcept { return vsubq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vsubq(float32x4_t a, float32x4_t b) noexcept { return vsubq_f32(a, b); } } template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vsubq>(register_type(lhs), register_type(rhs)); } /******** * ssub * ********/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vqsubq(uint8x16_t a, uint8x16_t b) noexcept { return vqsubq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vqsubq(int8x16_t a, int8x16_t b) noexcept { return vqsubq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vqsubq(uint16x8_t a, uint16x8_t b) noexcept { return vqsubq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vqsubq(int16x8_t a, int16x8_t b) noexcept { return vqsubq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vqsubq(uint32x4_t a, uint32x4_t b) noexcept { return vqsubq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vqsubq(int32x4_t a, int32x4_t b) noexcept { return vqsubq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vqsubq(uint64x2_t a, uint64x2_t b) noexcept { return vqsubq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vqsubq(int64x2_t a, int64x2_t b) noexcept { return vqsubq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vqsubq(float32x4_t a, float32x4_t b) noexcept { return vsubq_f32(a, b); } } template = 0> XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vqsubq>(register_type(lhs), register_type(rhs)); } /******* * mul * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vmulq(uint8x16_t a, uint8x16_t b) noexcept { return vmulq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vmulq(int8x16_t a, int8x16_t b) noexcept { return vmulq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vmulq(uint16x8_t a, uint16x8_t b) noexcept { return vmulq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vmulq(int16x8_t a, int16x8_t b) noexcept { return vmulq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vmulq(uint32x4_t a, uint32x4_t b) noexcept { return vmulq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vmulq(int32x4_t a, int32x4_t b) noexcept { return vmulq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vmulq(float32x4_t a, float32x4_t b) noexcept { return vmulq_f32(a, b); } } template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vmulq>(register_type(lhs), register_type(rhs)); } /******* * div * *******/ #if defined(XSIMD_FAST_INTEGER_DIVISION) template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs)); } template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs)); } #endif template XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html // get an initial estimate of 1/b. float32x4_t rcp = reciprocal(rhs); // use a couple Newton-Raphson steps to refine the estimate. Depending on your // application's accuracy requirements, you may be able to get away with only // one refinement (instead of the two used here). Be sure to test! 
rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); // and finally, compute a / b = a * (1 / b) return vmulq_f32(lhs, rcp); } /****** * eq * ******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vceqq(uint8x16_t a, uint8x16_t b) noexcept { return vceqq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vceqq(int8x16_t a, int8x16_t b) noexcept { return vceqq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vceqq(uint16x8_t a, uint16x8_t b) noexcept { return vceqq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vceqq(int16x8_t a, int16x8_t b) noexcept { return vceqq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vceqq(uint32x4_t a, uint32x4_t b) noexcept { return vceqq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vceqq(int32x4_t a, int32x4_t b) noexcept { return vceqq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vceqq(float32x4_t a, float32x4_t b) noexcept { return vceqq_f32(a, b); } } template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vceqq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return wrap::x_vceqq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { auto eq32 = vceqq_u32(vreinterpretq_u32_u64(lhs.data), vreinterpretq_u32_u64(rhs.data)); auto rev32 = vrev64q_u32(eq32); auto eq64 = vandq_u32(eq32, rev32); 
return batch_bool(vreinterpretq_u64_u32(eq64)); } template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { auto eq32 = vceqq_u32(vreinterpretq_u32_s64(lhs.data), vreinterpretq_u32_s64(rhs.data)); auto rev32 = vrev64q_u32(eq32); auto eq64 = vandq_u32(eq32, rev32); return batch_bool(vreinterpretq_u64_u32(eq64)); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return eq(batch { lhs.data }, batch { rhs.data }, A {}); } /************* * fast_cast * *************/ namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vcvtq_f32_s32(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vcvtq_f32_u32(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vcvtq_s32_f32(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vcvtq_u32_f32(self); } } /****** * lt * ******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcltq(uint8x16_t a, uint8x16_t b) noexcept { return vcltq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcltq(int8x16_t a, int8x16_t b) noexcept { return vcltq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcltq(uint16x8_t a, uint16x8_t b) noexcept { return vcltq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcltq(int16x8_t a, int16x8_t b) noexcept { return vcltq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcltq(uint32x4_t a, uint32x4_t b) noexcept { return vcltq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcltq(int32x4_t a, int32x4_t b) noexcept { return vcltq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcltq(float32x4_t a, float32x4_t b) noexcept { return vcltq_f32(a, b); } } template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vcltq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return batch_bool(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(lhs), register_type(rhs)), 63))); } template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull }; return batch_bool(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(rhs), register_type(lhs)), acc)), 63))); } /****** * le * ******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcleq(uint8x16_t a, uint8x16_t b) noexcept { return vcleq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcleq(int8x16_t a, int8x16_t b) noexcept { return vcleq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcleq(uint16x8_t a, uint16x8_t b) noexcept { return vcleq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcleq(int16x8_t a, int16x8_t b) noexcept { return vcleq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcleq(uint32x4_t a, uint32x4_t b) noexcept { return vcleq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcleq(int32x4_t a, int32x4_t b) noexcept { return vcleq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcleq(float32x4_t a, float32x4_t b) noexcept { return vcleq_f32(a, b); } } template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vcleq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return !(lhs > rhs); } /****** * gt * ******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcgtq(uint8x16_t a, uint8x16_t b) noexcept { return vcgtq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcgtq(int8x16_t a, int8x16_t b) noexcept { return vcgtq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcgtq(uint16x8_t a, uint16x8_t b) noexcept { return vcgtq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcgtq(int16x8_t a, int16x8_t b) noexcept { return vcgtq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcgtq(uint32x4_t a, uint32x4_t b) noexcept { return vcgtq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcgtq(int32x4_t a, int32x4_t b) noexcept { return vcgtq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcgtq(float32x4_t a, float32x4_t b) noexcept { return vcgtq_f32(a, b); } } template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vcgtq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return batch_bool(vreinterpretq_u64_s64(vshrq_n_s64(vqsubq_s64(register_type(rhs), register_type(lhs)), 63))); } template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; register_type acc = { 0x7FFFFFFFFFFFFFFFull, 0x7FFFFFFFFFFFFFFFull }; return batch_bool(vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(vqaddq_u64(vqsubq_u64(register_type(lhs), register_type(rhs)), acc)), 63))); } /****** * ge * ******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcgeq(uint8x16_t a, uint8x16_t b) noexcept { return vcgeq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vcgeq(int8x16_t a, int8x16_t b) noexcept { return vcgeq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcgeq(uint16x8_t a, uint16x8_t b) noexcept { return vcgeq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vcgeq(int16x8_t a, int16x8_t b) noexcept { return vcgeq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcgeq(uint32x4_t a, uint32x4_t b) noexcept { return vcgeq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcgeq(int32x4_t a, int32x4_t b) noexcept { return vcgeq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vcgeq(float32x4_t a, float32x4_t b) noexcept { return vcgeq_f32(a, b); } } template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vcgeq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return !(lhs < rhs); } /******************* * batch_bool_cast * *******************/ template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self); } /*************** * bitwise_and * ***************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vandq(uint8x16_t a, uint8x16_t b) noexcept { return vandq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vandq(int8x16_t a, int8x16_t b) noexcept { return vandq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vandq(uint16x8_t a, uint16x8_t b) noexcept { return vandq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vandq(int16x8_t a, int16x8_t b) noexcept { return vandq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vandq(uint32x4_t a, uint32x4_t b) noexcept { return vandq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vandq(int32x4_t a, int32x4_t b) noexcept { return vandq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vandq(uint64x2_t a, uint64x2_t b) noexcept { return vandq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vandq(int64x2_t a, int64x2_t b) noexcept { return vandq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vandq(float32x4_t a, float32x4_t b) noexcept { return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); } } template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vandq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return wrap::x_vandq>(register_type(lhs), register_type(rhs)); } /************** * bitwise_or * **************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vorrq(uint8x16_t a, uint8x16_t b) noexcept { return vorrq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vorrq(int8x16_t a, int8x16_t b) noexcept { return vorrq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vorrq(uint16x8_t a, uint16x8_t b) noexcept { return vorrq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vorrq(int16x8_t a, int16x8_t b) noexcept { return vorrq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vorrq(uint32x4_t a, uint32x4_t b) noexcept { return vorrq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vorrq(int32x4_t a, int32x4_t b) noexcept { return vorrq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vorrq(uint64x2_t a, uint64x2_t b) noexcept { return vorrq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vorrq(int64x2_t a, int64x2_t b) noexcept { return vorrq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vorrq(float32x4_t a, float32x4_t b) noexcept { return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); } } template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vorrq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return wrap::x_vorrq>(register_type(lhs), register_type(rhs)); } /*************** * bitwise_xor * ***************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_veorq(uint8x16_t a, uint8x16_t b) noexcept { return veorq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_veorq(int8x16_t a, int8x16_t b) noexcept { return veorq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_veorq(uint16x8_t a, uint16x8_t b) noexcept { return veorq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_veorq(int16x8_t a, int16x8_t b) noexcept { return veorq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_veorq(uint32x4_t a, uint32x4_t b) noexcept { return veorq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_veorq(int32x4_t a, int32x4_t b) noexcept { return veorq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_veorq(uint64x2_t a, uint64x2_t b) noexcept { return veorq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_veorq(int64x2_t a, int64x2_t b) noexcept { return veorq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_veorq(float32x4_t a, float32x4_t b) noexcept { return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); } } template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_veorq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return wrap::x_veorq>(register_type(lhs), register_type(rhs)); } /******* * neq * *******/ template XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return bitwise_xor(lhs, rhs, A {}); } /*************** * bitwise_not * ***************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is 
required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vmvnq(uint8x16_t a) noexcept { return vmvnq_u8(a); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vmvnq(int8x16_t a) noexcept { return vmvnq_s8(a); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vmvnq(uint16x8_t a) noexcept { return vmvnq_u16(a); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vmvnq(int16x8_t a) noexcept { return vmvnq_s16(a); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vmvnq(uint32x4_t a) noexcept { return vmvnq_u32(a); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vmvnq(int32x4_t a) noexcept { return vmvnq_s32(a); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vmvnq(uint64x2_t a) noexcept { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a))); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vmvnq(int64x2_t a) noexcept { return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a))); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vmvnq(float32x4_t a) noexcept { return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a))); } } template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vmvnq>(register_type(arg)); } template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return wrap::x_vmvnq>(register_type(arg)); } /****************** * bitwise_andnot * ******************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vbicq(uint8x16_t a, uint8x16_t b) noexcept { return vbicq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vbicq(int8x16_t a, int8x16_t b) noexcept { return vbicq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vbicq(uint16x8_t a, uint16x8_t b) noexcept { return vbicq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vbicq(int16x8_t a, int16x8_t b) noexcept { return vbicq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vbicq(uint32x4_t a, uint32x4_t b) noexcept { return vbicq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vbicq(int32x4_t a, int32x4_t b) noexcept { return vbicq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vbicq(uint64x2_t a, uint64x2_t b) noexcept { return vbicq_u64(a, b); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vbicq(int64x2_t a, int64x2_t b) noexcept { return vbicq_s64(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vbicq(float32x4_t a, float32x4_t b) noexcept { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b))); } } template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vbicq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return wrap::x_vbicq>(register_type(lhs), register_type(rhs)); } /******* * min * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vminq(uint8x16_t a, uint8x16_t b) noexcept { return vminq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vminq(int8x16_t a, int8x16_t b) noexcept { return vminq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vminq(uint16x8_t a, uint16x8_t b) noexcept { return vminq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vminq(int16x8_t a, int16x8_t b) noexcept { return vminq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vminq(uint32x4_t a, uint32x4_t b) noexcept { return vminq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vminq(int32x4_t a, int32x4_t b) noexcept { return vminq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vminq(float32x4_t a, float32x4_t b) noexcept { return vminq_f32(a, b); } } template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vminq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return select(lhs > rhs, rhs, lhs); } /******* * max * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vmaxq(uint8x16_t a, uint8x16_t b) noexcept { return vmaxq_u8(a, b); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vmaxq(int8x16_t a, int8x16_t b) noexcept { return vmaxq_s8(a, b); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vmaxq(uint16x8_t a, uint16x8_t b) noexcept { return vmaxq_u16(a, b); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vmaxq(int16x8_t a, int16x8_t b) noexcept { return vmaxq_s16(a, b); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vmaxq(uint32x4_t a, uint32x4_t b) noexcept { return vmaxq_u32(a, b); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vmaxq(int32x4_t a, int32x4_t b) noexcept { return vmaxq_s32(a, b); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vmaxq(float32x4_t a, float32x4_t b) noexcept { return vmaxq_f32(a, b); } } template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vmaxq>(register_type(lhs), register_type(rhs)); } template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return select(lhs > rhs, lhs, rhs); } /******* * abs * *******/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vabsq(uint8x16_t a) noexcept { return a; } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vabsq(int8x16_t a) noexcept { return vabsq_s8(a); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vabsq(uint16x8_t a) noexcept { return a; } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vabsq(int16x8_t a) noexcept { return vabsq_s16(a); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vabsq(uint32x4_t a) noexcept { return a; } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vabsq(int32x4_t a) noexcept { return vabsq_s32(a); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vabsq(float32x4_t a) noexcept { return vabsq_f32(a); } } template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vabsq>(register_type(arg)); } /******** * rsqrt * ********/ template XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept { return vrsqrteq_f32(arg); } /******** * sqrt * ********/ template XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept { batch sqrt_reciprocal = vrsqrteq_f32(arg); // one iter sqrt_reciprocal = sqrt_reciprocal * batch(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); batch sqrt_approx = arg * sqrt_reciprocal * batch(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); batch zero(0.f); return select(arg == zero, zero, sqrt_approx); } /******************** * Fused operations * ********************/ #ifdef __ARM_FEATURE_FMA template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f32(z, x, y); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f32(-z, x, y); } #endif /********* * haddp * *********/ template XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept { // row = (a,b,c,d) float32x2_t tmp1, tmp2, tmp3; // tmp1 
= (a0 + a2, a1 + a3) tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0])); // tmp2 = (b0 + b2, b1 + b3) tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1])); // tmp1 = (a0..3, b0..3) tmp1 = vpadd_f32(tmp1, tmp2); // tmp2 = (c0 + c2, c1 + c3) tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2])); // tmp3 = (d0 + d2, d1 + d3) tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3])); // tmp2 = (c0..3, d0..3) tmp2 = vpadd_f32(tmp2, tmp3); // return = (a0..3, b0..3, c0..3, d0..3) return vcombine_f32(tmp1, tmp2); } /************** * reciprocal * **************/ /* Returns the vrecpeq_f32 reciprocal estimate of x; no refinement step is applied here. */ template XSIMD_INLINE batch reciprocal(const batch& x, kernel::requires_arch) noexcept { return vrecpeq_f32(x); } /********** * insert * **********/ /* Each overload returns a copy of self with lane I replaced by val, using the vsetq_lane_* intrinsic that matches the lane type. */ template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_u8(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_s8(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_u16(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, int16_t val, index, requires_arch) noexcept { return vsetq_lane_s16(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_u32(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_s32(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_u64(val, self, I); } template = 0> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vsetq_lane_s64(val, self, I); } template XSIMD_INLINE batch insert(batch const& self, float val, index, requires_arch) noexcept { return vsetq_lane_f32(val, self, I); } 
/******************** * nearbyint_as_int * ********************/ /* Converts each float lane of self to int32, rounding to the nearest integer; lanes whose distance from the truncated value is exactly +/-0.5 are rounded to the even neighbour instead. */ template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */ // Contributors to this work are: // John W. Ratcliff // Brandon Rowlett // Ken Fast // Eric van Beurden // Alexander Potylitsin // Hasindu Gamaarachchi // Jim Huang // Mark Cheng // Malcolm James MacLeod // Devin Hussey (easyaspi314) // Sebastian Pop // Developer Ecosystem Engineering // Danila Kutenin // François Turban (JishinMaster) // Pei-Hsuan Hung // Yang-Hao Yuan // Syoyo Fujita // Brecht Van Lommel /* * sse2neon is freely redistributable under the MIT License. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ const auto signmask = vdupq_n_u32(0x80000000); const auto half = vbslq_f32(signmask, self, vdupq_n_f32(0.5f)); /* +/- 0.5 */ const auto r_normal = vcvtq_s32_f32(vaddq_f32( self, half)); /* round to integer: [a + 0.5]*/ const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */ const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ const auto delta = vsubq_f32( self, vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vbslq_s32(is_delta_half, r_even, r_normal); } /************** * reduce_add * **************/ namespace detail { template XSIMD_INLINE T sum_batch(V const& arg) noexcept { T res = T(0); for (std::size_t i = 0; i < batch::size; ++i) { res += arg[i]; } return res; } } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg)); tmp = vpadd_u8(tmp, tmp); tmp = vpadd_u8(tmp, tmp); tmp = vpadd_u8(tmp, tmp); return vget_lane_u8(tmp, 0); } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg)); tmp = vpadd_s8(tmp, tmp); tmp = vpadd_s8(tmp, tmp); tmp = vpadd_s8(tmp, tmp); return vget_lane_s8(tmp, 0); } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg)); tmp = vpadd_u16(tmp, tmp); tmp = vpadd_u16(tmp, tmp); return vget_lane_u16(tmp, 0); } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg)); tmp = vpadd_s16(tmp, tmp); tmp = 
vpadd_s16(tmp, tmp); return vget_lane_s16(tmp, 0); } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg)); tmp = vpadd_u32(tmp, tmp); return vget_lane_u32(tmp, 0); } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg)); tmp = vpadd_s32(tmp, tmp); return vget_lane_s32(tmp, 0); } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { return arg.get(0) + arg.get(1); } template XSIMD_INLINE float reduce_add(batch const& arg, requires_arch) noexcept { float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg)); tmp = vpadd_f32(tmp, tmp); return vget_lane_f32(tmp, 0); } /************** * reduce_max * **************/ // Using common implementation because ARM does not provide intrinsics // for this operation /************** * reduce_min * **************/ // Using common implementation because ARM does not provide intrinsics // for this operation /************** * reduce_mul * **************/ // Using common implementation because ARM does not provide intrinsics // for this operation /********** * select * **********/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_vbslq(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return vbslq_u8(a, b, c); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_vbslq(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return vbslq_s8(a, b, c); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_vbslq(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return vbslq_u16(a, b, c); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_vbslq(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return vbslq_s16(a, b, c); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_vbslq(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return vbslq_u32(a, b, c); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_vbslq(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return vbslq_s32(a, b, c); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_vbslq(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return vbslq_u64(a, b, c); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_vbslq(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return vbslq_s64(a, b, c); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_vbslq(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return vbslq_f32(a, b, c); } } template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { using bool_register_type = typename batch_bool::register_type; using register_type = typename batch::register_type; return wrap::x_vbslq>(bool_register_type(cond), register_type(a), register_type(b)); } template = 0> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { b... 
}, true_br, false_br, neon {}); } /************* * transpose * *************/ template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; auto t01 = vtrnq_f32(r0, r1); auto t23 = vtrnq_f32(r2, r3); matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0])); matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1])); matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0])); matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1])); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; auto t01 = vtrnq_u32(r0, r1); auto t23 = vtrnq_u32(r2, r3); matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0])); matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1])); matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0])); matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1])); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; auto t01 = vtrnq_s32(r0, r1); auto t23 = vtrnq_s32(r2, r3); matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0])); matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1])); 
matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0])); matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1])); } template = 0> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = vcombine_u64(vget_low_u64(r0), vget_low_u64(r1)); matrix_begin[1] = vcombine_u64(vget_high_u64(r0), vget_high_u64(r1)); } template = 0> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = vcombine_s64(vget_low_s64(r0), vget_low_s64(r1)); matrix_begin[1] = vcombine_s64(vget_high_s64(r0), vget_high_s64(r1)); } /********** * zip_lo * **********/ template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs)); return vcombine_u8(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs)); return vcombine_s8(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs)); return vcombine_u16(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs)); return vcombine_s16(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), 
vget_low_u32(rhs)); return vcombine_u32(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs)); return vcombine_s32(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs)); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs)); } template XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs)); return vcombine_f32(tmp.val[0], tmp.val[1]); } /********** * zip_hi * **********/ template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs)); return vcombine_u8(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs)); return vcombine_s8(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs)); return vcombine_u16(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs)); return vcombine_s16(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs)); return vcombine_u32(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch 
const& rhs, requires_arch) noexcept { int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs)); return vcombine_s32(tmp.val[0], tmp.val[1]); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs)); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs)); } template XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs)); return vcombine_f32(tmp.val[0], tmp.val[1]); } /**************** * extract_pair * ****************/ namespace detail { template XSIMD_INLINE batch extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept { assert(false && "extract_pair out of bounds"); return batch {}; } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_u8(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_s8(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_u16(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_s16(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, 
batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_u32(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_s32(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_u64(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_s64(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_f32(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template XSIMD_INLINE batch extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept { if (n == 0) { return rhs; } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } } template XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(n < size && "index in bounds"); return detail::extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); } /****************** * bitwise_lshift * ******************/ namespace detail { template XSIMD_INLINE batch bitwise_lshift(batch const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept { assert(false && "bitwise_lshift out of bounds"); return batch {}; } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int 
n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_u8(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_s8(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_u16(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_s16(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_u32(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_s32(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_u64(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshlq_n_s64(lhs, I); } else { return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence()); } } template XSIMD_INLINE batch bitwise_lshift_impl(batch const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept { if (n == 0) { return lhs; } else { return bitwise_lshift(lhs, n, 
::xsimd::detail::int_sequence()); } } template XSIMD_INLINE bool shifts_all_positive(batch const& b) noexcept { std::array::size> tmp = {}; b.store_unaligned(tmp.begin()); return std::all_of(tmp.begin(), tmp.end(), [](T x) { return x >= 0; }); } } template XSIMD_INLINE batch bitwise_lshift(batch const& lhs, int n, requires_arch) noexcept { constexpr int size = sizeof(typename batch::value_type) * 8; assert(0 <= n && n < size && "index in bounds"); return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence()); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u8(lhs, vreinterpretq_s8_u8(rhs)); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s8(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u16(lhs, vreinterpretq_s16_u16(rhs)); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s16(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u32(lhs, vreinterpretq_s32_u32(rhs)); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s32(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB 
assert(detail::shifts_all_positive(rhs)); return vshlq_u64(lhs, vreinterpretq_s64_u64(rhs)); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s64(lhs, rhs); } // immediate variant template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_u8(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_s8(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_u16(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_s16(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_u32(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_s32(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_u64(x, shift); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& x, requires_arch) noexcept { return vshlq_n_s64(x, shift); } /****************** * bitwise_rshift * ******************/ namespace detail { template XSIMD_INLINE batch bitwise_rshift(batch const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept { assert(false && "bitwise_rshift out of bounds"); return batch {}; } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_u8(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_s8(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch 
bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_u16(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_s16(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_u32(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_s32(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_u64(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, ::xsimd::detail::int_sequence) noexcept { if (n == I) { return vshrq_n_s64(lhs, I); } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } template XSIMD_INLINE batch bitwise_rshift_impl(batch const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept { if (n == 0) { return lhs; } else { return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence()); } } } template XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, requires_arch) noexcept { constexpr int size = sizeof(typename batch::value_type) * 8; assert(0 <= n && n < size && "index in bounds"); return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence()); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { 
// Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u8(lhs, vnegq_s8(vreinterpretq_s8_u8(rhs))); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s8(lhs, vnegq_s8(rhs)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u16(lhs, vnegq_s16(vreinterpretq_s16_u16(rhs))); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s16(lhs, vnegq_s16(rhs)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u32(lhs, vnegq_s32(vreinterpretq_s32_u32(rhs))); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s32(lhs, vnegq_s32(rhs)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch req) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); using S = std::make_signed_t; return vshlq_u64(lhs, neg(batch(vreinterpretq_s64_u64(rhs)), req).data); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s64(lhs, neg(rhs, neon {}).data); } // immediate variant template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_u8(x, shift); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_s8(x, shift); } template = 0> XSIMD_INLINE batch 
bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_u16(x, shift); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_s16(x, shift); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_u32(x, shift); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_s32(x, shift); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_u64(x, shift); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& x, requires_arch) noexcept { return vshrq_n_s64(x, shift); } // get template XSIMD_INLINE float get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_f32(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_u8(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_s8(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_u16(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_s16(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_u32(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_s32(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_u64(self, I); } template = 0> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_s64(self, I); } // first template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return vgetq_lane_f32(self, 0); } template = 0> 
XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_u8(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_s8(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_u16(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_s16(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_u32(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_s32(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_u64(val, 0); } template = 0> XSIMD_INLINE T first(batch val, requires_arch) noexcept { return vgetq_lane_s64(val, 0); } // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7 /******* * all * *******/ template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg)); return vget_lane_u64(tmp, 0) == ~0ULL; } template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return all(batch_bool(vreinterpretq_u64_u8(arg)), neon {}); } template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return all(batch_bool(vreinterpretq_u64_u16(arg)), neon {}); } template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return all(batch_bool(vreinterpretq_u64_u32(arg)), neon {}); } /******* * any * *******/ template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { uint32x2_t tmp = vqmovn_u64(arg); return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0; } template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return any(batch_bool(vreinterpretq_u64_u8(arg)), neon {}); } template = 0> XSIMD_INLINE bool any(batch_bool const& arg, 
requires_arch) noexcept { return any(batch_bool(vreinterpretq_u64_u16(arg)), neon {}); } template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return any(batch_bool(vreinterpretq_u64_u32(arg)), neon {}); } /********* * isnan * *********/ template XSIMD_INLINE batch_bool isnan(batch const& arg, requires_arch) noexcept { return !(arg == arg); } // slide_left namespace detail { template struct slider_left { template XSIMD_INLINE batch operator()(batch const& x, requires_arch) noexcept { const auto left = vdupq_n_u8(0); const auto right = bitwise_cast(x).data; const batch res(vextq_u8(left, right, 16 - N)); return bitwise_cast(res); } }; template <> struct slider_left<0> { template XSIMD_INLINE batch operator()(batch const& x, requires_arch) noexcept { return x; } }; } // namespace detail template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { return detail::slider_left {}(x, A {}); } // slide_right namespace detail { template struct slider_right { template XSIMD_INLINE batch operator()(batch const& x, requires_arch) noexcept { const auto left = bitwise_cast(x).data; const auto right = vdupq_n_u8(0); const batch res(vextq_u8(left, right, N)); return bitwise_cast(res); } }; template <> struct slider_right<16> { template XSIMD_INLINE batch operator()(batch const&, requires_arch) noexcept { return batch {}; } }; } // namespace detail template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { return detail::slider_right {}(x, A {}); } /**************** * rotate_left * ****************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8x16_t x_rotate_left(uint8x16_t a, uint8x16_t b) noexcept { return vextq_u8(a, b, N); } template ::value, int> = 0> XSIMD_INLINE int8x16_t x_rotate_left(int8x16_t a, int8x16_t b) noexcept { return vextq_s8(a, b, N); } template ::value, int> = 0> XSIMD_INLINE uint16x8_t x_rotate_left(uint16x8_t a, uint16x8_t b) noexcept { return vextq_u16(a, b, N % 8); } template ::value, int> = 0> XSIMD_INLINE int16x8_t x_rotate_left(int16x8_t a, int16x8_t b) noexcept { return vextq_s16(a, b, N % 8); } template ::value, int> = 0> XSIMD_INLINE uint32x4_t x_rotate_left(uint32x4_t a, uint32x4_t b) noexcept { return vextq_u32(a, b, N % 4); } template ::value, int> = 0> XSIMD_INLINE int32x4_t x_rotate_left(int32x4_t a, int32x4_t b) noexcept { return vextq_s32(a, b, N % 4); } template ::value, int> = 0> XSIMD_INLINE uint64x2_t x_rotate_left(uint64x2_t a, uint64x2_t b) noexcept { return vextq_u64(a, b, N % 2); } template ::value, int> = 0> XSIMD_INLINE int64x2_t x_rotate_left(int64x2_t a, int64x2_t b) noexcept { return vextq_s64(a, b, N % 2); } template ::value, int> = 0> XSIMD_INLINE float32x4_t x_rotate_left(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N % 4); } } template = 0> XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_rotate_left>(register_type(a), register_type(a)); } } template struct batch_constant; namespace kernel { /*********** * swizzle * ***********/ template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { static_assert(batch::size == sizeof...(idx), "valid swizzle indices"); std::array::size> data; self.store_aligned(data.data()); return set(batch(), A(), data[idx]...); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 0) { auto lo = vget_low_u64(self); return vcombine_u64(lo, lo); } 
XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 1) { auto hi = vget_high_u64(self); return vcombine_u64(hi, hi); } XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1) { return self; } else { return vextq_u64(self, self, 1); } } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return vreinterpretq_s64_u64(swizzle(vreinterpretq_u64_s64(self), mask, A {})); } namespace detail { template XSIMD_INLINE uint8x8_t make_mask() { uint8x8_t res = { static_cast((Va % 2) * 4 + 0), static_cast((Va % 2) * 4 + 1), static_cast((Va % 2) * 4 + 2), static_cast((Va % 2) * 4 + 3), static_cast((Vb % 2) * 4 + 0), static_cast((Vb % 2) * 4 + 1), static_cast((Vb % 2) * 4 + 2), static_cast((Vb % 2) * 4 + 3), }; return res; } } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { constexpr bool is_identity = detail::is_identity(mask); constexpr bool is_dup_lo = detail::is_dup_lo(mask); constexpr bool is_dup_hi = detail::is_dup_hi(mask); XSIMD_IF_CONSTEXPR(is_identity) { return self; } XSIMD_IF_CONSTEXPR(is_dup_lo) { XSIMD_IF_CONSTEXPR(V0 == 0 && V1 == 1) { return vreinterpretq_u32_u64(vdupq_lane_u64(vget_low_u64(vreinterpretq_u64_u32(self)), 0)); } XSIMD_IF_CONSTEXPR(V0 == 1 && V1 == 0) { return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_low_u32(self))), 0)); } return vdupq_n_u32(vgetq_lane_u32(self, V0)); } XSIMD_IF_CONSTEXPR(is_dup_hi) { XSIMD_IF_CONSTEXPR(V0 == 2 && V1 == 3) { return vreinterpretq_u32_u64(vdupq_lane_u64(vget_high_u64(vreinterpretq_u64_u32(self)), 0)); } XSIMD_IF_CONSTEXPR(V0 == 3 && V1 == 2) { return vreinterpretq_u32_u64(vdupq_lane_u64(vreinterpret_u64_u32(vrev64_u32(vget_high_u32(self))), 0)); } return vdupq_n_u32(vgetq_lane_u32(self, V0)); } XSIMD_IF_CONSTEXPR(V0 < 2 && V1 < 2 && V2 < 2 && V3 < 2) { uint8x8_t low = vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u32(self))); uint8x8_t mask_lo = detail::make_mask(); uint8x8_t mask_hi = detail::make_mask(); 
uint8x8_t lo = vtbl1_u8(low, mask_lo); uint8x8_t hi = vtbl1_u8(low, mask_hi); return vreinterpretq_u32_u8(vcombine_u8(lo, hi)); } XSIMD_IF_CONSTEXPR(V0 >= 2 && V1 >= 2 && V2 >= 2 && V3 >= 2) { uint8x8_t high = vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u32(self))); uint8x8_t mask_lo = detail::make_mask(); uint8x8_t mask_hi = detail::make_mask(); uint8x8_t lo = vtbl1_u8(high, mask_lo); uint8x8_t hi = vtbl1_u8(high, mask_hi); return vreinterpretq_u32_u8(vcombine_u8(lo, hi)); } uint8x8_t mask_lo = detail::make_mask(); uint8x8_t mask_hi = detail::make_mask(); uint8x8_t low = vreinterpret_u8_u64(vget_low_u64(vreinterpretq_u64_u32(self))); uint8x8_t lol = vtbl1_u8(low, mask_lo); uint8x8_t loh = vtbl1_u8(low, mask_hi); uint32x4_t true_br = vreinterpretq_u32_u8(vcombine_u8(lol, loh)); uint8x8_t high = vreinterpret_u8_u64(vget_high_u64(vreinterpretq_u64_u32(self))); uint8x8_t hil = vtbl1_u8(high, mask_lo); uint8x8_t hih = vtbl1_u8(high, mask_hi); uint32x4_t false_br = vreinterpretq_u32_u8(vcombine_u8(hil, hih)); batch_bool_constant blend_mask; return select(blend_mask, batch(true_br), batch(false_br), A {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return vreinterpretq_s32_u32(swizzle(vreinterpretq_u32_s32(self), mask, A {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return vreinterpretq_f32_u32(swizzle(batch(vreinterpretq_u32_f32(self)), mask, A {})); } /********* * widen * *********/ template = 0> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(vaddl_s8(vget_low_s8(x), vdup_n_s8(0))), batch, A>(vaddl_s8(vget_high_s8(x), vdup_n_s8(0))) }; } template = 0> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(vaddl_u8(vget_low_u8(x), vdup_n_u8(0))), batch, A>(vaddl_u8(vget_high_u8(x), vdup_n_u8(0))) }; } template = 0> XSIMD_INLINE std::array, A>, 
2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(vaddl_s16(vget_low_s16(x), vdup_n_s16(0))), batch, A>(vaddl_s16(vget_high_s16(x), vdup_n_s16(0))) }; } template = 0> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(vaddl_u16(vget_low_u16(x), vdup_n_u16(0))), batch, A>(vaddl_u16(vget_high_u16(x), vdup_n_u16(0))) }; } template = 0> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(vaddl_s32(vget_low_s32(x), vdup_n_s32(0))), batch, A>(vaddl_s32(vget_high_s32(x), vdup_n_s32(0))) }; } template = 0> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(vaddl_u32(vget_low_u32(x), vdup_n_u32(0))), batch, A>(vaddl_u32(vget_high_u32(x), vdup_n_u32(0))) }; } /******** * mask * ********/ namespace detail { #ifdef XSIMD_LITTLE_ENDIAN static constexpr bool do_swap = false; #else static constexpr bool do_swap = true; #endif } template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h uint8x16_t msbs = vshrq_n_u8(self, 7); XSIMD_IF_CONSTEXPR(detail::do_swap) { msbs = vrev64q_u8(msbs); } uint64x2_t bits = vreinterpretq_u64_u8(msbs); bits = vsraq_n_u64(bits, bits, 7); bits = vsraq_n_u64(bits, bits, 14); bits = vsraq_n_u64(bits, bits, 28); uint8x16_t output = vreinterpretq_u8_u64(bits); constexpr int offset = detail::do_swap ? 
7 : 0; return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 8; } template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h uint16x8_t msbs = vshrq_n_u16(self, 15); XSIMD_IF_CONSTEXPR(detail::do_swap) { msbs = vrev64q_u16(msbs); } uint64x2_t bits = vreinterpretq_u64_u16(msbs); bits = vsraq_n_u64(bits, bits, 15); bits = vsraq_n_u64(bits, bits, 30); uint8x16_t output = vreinterpretq_u8_u64(bits); constexpr int offset = detail::do_swap ? 7 : 0; return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 4; } template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { // Adapted from https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h uint32x4_t msbs = vshrq_n_u32(self, 31); XSIMD_IF_CONSTEXPR(detail::do_swap) { msbs = vrev64q_u32(msbs); } uint64x2_t bits = vreinterpretq_u64_u32(msbs); bits = vsraq_n_u64(bits, bits, 31); uint8x16_t output = vreinterpretq_u8_u64(bits); constexpr int offset = detail::do_swap ? 7 : 0; return vgetq_lane_u8(output, offset) | vgetq_lane_u8(output, offset + 8) << 2; } template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { uint64_t mask_lo = vgetq_lane_u64(self, 0); uint64_t mask_hi = vgetq_lane_u64(self, 1); return ((mask_lo >> 63) | (mask_hi << 1)) & 0x3; } /********* * count * *********/ // NOTE: Extracting a u32 for the return value saves two instructions on 32-bit ARM: // . 
template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { uint8x16_t msbs = vshrq_n_u8(self, 7); uint64x2_t psum = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(msbs))); uint64x1_t total = vadd_u64(vget_low_u64(psum), vget_high_u64(psum)); assert(vget_lane_u64(total, 0) <= std::numeric_limits::max()); return vget_lane_u32(vreinterpret_u32_u64(total), 0); } template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { uint16x8_t msbs = vshrq_n_u16(self, 15); uint64x2_t psum = vpaddlq_u32(vpaddlq_u16(msbs)); uint64x1_t total = vadd_u64(vget_low_u64(psum), vget_high_u64(psum)); assert(vget_lane_u64(total, 0) <= std::numeric_limits::max()); return vget_lane_u32(vreinterpret_u32_u64(total), 0); } template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { uint32x4_t msbs = vshrq_n_u32(self, 31); uint64x2_t psum = vpaddlq_u32(msbs); uint64x1_t total = vadd_u64(vget_low_u64(psum), vget_high_u64(psum)); assert(vget_lane_u64(total, 0) <= std::numeric_limits::max()); return vget_lane_u32(vreinterpret_u32_u64(total), 0); } template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { uint64x2_t msbs = vshrq_n_u64(self, 63); uint64x1_t total = vadd_u64(vget_low_u64(msbs), vget_high_u64(msbs)); assert(vget_lane_u64(total, 0) <= std::numeric_limits::max()); return vget_lane_u32(vreinterpret_u32_u64(total), 0); } #define WRAP_MASK_OP(OP) \ template = 0> \ XSIMD_INLINE size_t OP(batch_bool const& self, requires_arch) noexcept \ { \ uint8x16_t inner = self; \ XSIMD_IF_CONSTEXPR(detail::do_swap) \ { \ inner = vrev16q_u8(inner); \ } \ \ uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(inner), 4); \ XSIMD_IF_CONSTEXPR(detail::do_swap) \ { \ narrowed = vrev64_u8(narrowed); \ } \ \ uint64_t result = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0); \ return xsimd::detail::OP(result) / 4; \ } \ template = 0> \ XSIMD_INLINE size_t OP(batch_bool const& self, requires_arch) 
noexcept \ { \ uint8x8_t narrowed = vmovn_u16(self); \ XSIMD_IF_CONSTEXPR(detail::do_swap) \ { \ narrowed = vrev64_u8(narrowed); \ } \ \ uint64_t result = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0); \ return xsimd::detail::OP(result) / 8; \ } \ template = 0> \ XSIMD_INLINE size_t OP(batch_bool const& self, requires_arch) noexcept \ { \ uint16x4_t narrowed = vmovn_u32(self); \ XSIMD_IF_CONSTEXPR(detail::do_swap) \ { \ narrowed = vrev64_u16(narrowed); \ } \ \ uint64_t result = vget_lane_u64(vreinterpret_u64_u16(narrowed), 0); \ return xsimd::detail::OP(result) / 16; \ } \ template = 0> \ XSIMD_INLINE size_t OP(batch_bool const& self, requires_arch) noexcept \ { \ uint32x2_t narrowed = vmovn_u64(self); \ XSIMD_IF_CONSTEXPR(detail::do_swap) \ { \ narrowed = vrev64_u32(narrowed); \ } \ \ uint64_t result = vget_lane_u64(vreinterpret_u64_u32(narrowed), 0); \ return xsimd::detail::OP(result) / 32; \ } WRAP_MASK_OP(countl_zero) WRAP_MASK_OP(countl_one) WRAP_MASK_OP(countr_zero) WRAP_MASK_OP(countr_one) #undef WRAP_MASK_OP } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_neon64.hpp000066400000000000000000002102131517435117100244400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON64_HPP #define XSIMD_NEON64_HPP #include #include #include #include #include #include "../types/xsimd_neon64_register.hpp" #include "../types/xsimd_utils.hpp" #include "./xsimd_neon.hpp" namespace xsimd { template struct batch_bool_constant; namespace kernel { using namespace types; namespace detail { template using enable_neon64_type_t = std::enable_if_t::value || std::is_same::value || std::is_same::value, int>; } // get template XSIMD_INLINE double get(batch const& self, ::xsimd::index, requires_arch) noexcept { return vgetq_lane_f64(self, I); } // first template XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return vgetq_lane_f64(self, 0); } /******* * all * *******/ template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return vminvq_u32(arg) == ~0U; } template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return all(batch_bool(vreinterpretq_u32_u8(arg)), neon64 {}); } template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return all(batch_bool(vreinterpretq_u32_u16(arg)), neon64 {}); } template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return all(batch_bool(vreinterpretq_u32_u64(arg)), neon64 {}); } /******* * any * *******/ template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return vmaxvq_u32(arg) != 0; } template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return any(batch_bool(vreinterpretq_u32_u8(arg)), neon64 {}); } template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return any(batch_bool(vreinterpretq_u32_u16(arg)), neon64 {}); } template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return any(batch_bool(vreinterpretq_u32_u64(arg)), neon64 {}); } /************* * broadcast * 
*************/ // Required to avoid ambiguous call template XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return broadcast(val, neon {}); } template XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return vdupq_n_f64(val); } /************* * from_bool * *************/ template XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.)))); } /******** * load * ********/ #if defined(__clang__) || defined(__GNUC__) #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16)) #else #define xsimd_aligned_load(inst, type, expr) inst((type)expr) #endif template XSIMD_INLINE batch load_aligned(double const* src, convert, requires_arch) noexcept { return xsimd_aligned_load(vld1q_f64, double*, src); } template XSIMD_INLINE batch load_unaligned(double const* src, convert, requires_arch) noexcept { return vld1q_f64(src); } #undef xsimd_aligned_load /********* * store * *********/ template XSIMD_INLINE void store_aligned(double* dst, batch const& src, requires_arch) noexcept { vst1q_f64(dst, src); } template XSIMD_INLINE void store_unaligned(double* dst, batch const& src, requires_arch) noexcept { return store_aligned(dst, src, A {}); } /**************** * store_stream * ****************/ #if defined(__GNUC__) template XSIMD_INLINE void store_stream(float* mem, batch const& val, requires_arch) noexcept { float32x2_t lo = vget_low_f32(val); float32x2_t hi = vget_high_f32(val); __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" : : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem) : "memory"); } template XSIMD_INLINE void store_stream(double* mem, batch const& val, requires_arch) noexcept { float64x1_t lo = vget_low_f64(val); float64x1_t hi = vget_high_f64(val); __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" : : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem) : "memory"); } template ::value, void>::type> XSIMD_INLINE 
void store_stream(T* mem, batch const& val, requires_arch) noexcept { uint64x2_t u64; std::memcpy(&u64, &val, sizeof(u64)); uint64x1_t lo = vget_low_u64(u64); uint64x1_t hi = vget_high_u64(u64); __asm__ __volatile__("stnp %d[lo], %d[hi], [%[mem]]" : : [lo] "w"(lo), [hi] "w"(hi), [mem] "r"(mem) : "memory"); } #endif /*************** * load_stream * ***************/ #if defined(__GNUC__) template XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept { float32x2_t lo, hi; __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" : [lo] "=w"(lo), [hi] "=w"(hi) : [mem] "r"(mem) : "memory"); return vcombine_f32(lo, hi); } template XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept { float64x1_t lo, hi; __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" : [lo] "=w"(lo), [hi] "=w"(hi) : [mem] "r"(mem) : "memory"); return vcombine_f64(lo, hi); } template ::value, void>::type> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept { uint64x1_t lo, hi; __asm__ __volatile__("ldnp %d[lo], %d[hi], [%[mem]]" : [lo] "=w"(lo), [hi] "=w"(hi) : [mem] "r"(mem) : "memory"); uint64x2_t u64 = vcombine_u64(lo, hi); batch result; std::memcpy(&result, &u64, sizeof(u64)); return result; } #endif /********************* * store * *********************/ template XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { store(batch_bool(b.data), mem, A {}); } /**************** * load_complex * ****************/ template XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; const double* buf = reinterpret_cast(mem); float64x2x2_t tmp = vld2q_f64(buf); real_batch real = tmp.val[0], imag = tmp.val[1]; return batch, A> { real, imag }; } template XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) noexcept { return load_complex_aligned(mem, cvt, A {}); } /***************** * 
store_complex * *****************/ template XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { float64x2x2_t tmp; tmp.val[0] = src.real(); tmp.val[1] = src.imag(); double* buf = reinterpret_cast(dst); vst2q_f64(buf, tmp); } template XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { store_complex_aligned(dst, src, A {}); } /******* * set * *******/ template XSIMD_INLINE batch set(batch const&, requires_arch req, double d0, double d1) noexcept { alignas(A::alignment()) double data[] = { d0, d1 }; return load_aligned(data, {}, req); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, bool b0, bool b1) noexcept { using unsigned_type = as_unsigned_integer_t; auto const out = batch { static_cast(b0 ? -1LL : 0LL), static_cast(b1 ? -1LL : 0LL) }; return { out.data }; } /******* * neg * *******/ template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs))); } template = 0> XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_s64(rhs); } template XSIMD_INLINE batch neg(batch const& rhs, requires_arch) noexcept { return vnegq_f64(rhs); } /******* * add * *******/ template XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vaddq_f64(lhs, rhs); } /******** * sadd * ********/ template XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { return add(lhs, rhs, neon64 {}); } /******* * sub * *******/ template XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vsubq_f64(lhs, rhs); } /******** * ssub * ********/ template XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return sub(lhs, rhs, neon64 {}); } /******* * mul * *******/ template XSIMD_INLINE batch mul(batch const& lhs, batch const& 
rhs, requires_arch) noexcept { return vmulq_f64(lhs, rhs); } /******* * div * *******/ #if defined(XSIMD_FAST_INTEGER_DIVISION) template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs)); } template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs)); } #endif template XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vdivq_f64(lhs, rhs); } /****** * eq * ******/ template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vceqq_s64(lhs, rhs); } template XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vceqq_f64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } template XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vceqq_u64(lhs, rhs); } /************* * fast_cast * *************/ namespace detail { template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { return vcvtq_f64_s64(x); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { return vcvtq_f64_u64(x); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { return vcvtq_s64_f64(x); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { return vcvtq_u64_f64(x); } } /****** * lt * ******/ 
template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcltq_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcltq_s64(lhs, rhs); } template XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcltq_f64(lhs, rhs); } /****** * le * ******/ template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcleq_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcleq_s64(lhs, rhs); } template XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcleq_f64(lhs, rhs); } /****** * gt * ******/ template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgtq_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgtq_s64(lhs, rhs); } template XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgtq_f64(lhs, rhs); } /****** * ge * ******/ template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgeq_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgeq_s64(lhs, rhs); } template XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vcgeq_f64(lhs, rhs); } /******************* * batch_bool_cast * *******************/ template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self); } /*************** * bitwise_and * ***************/ template XSIMD_INLINE batch 
bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vandq_u64(lhs, rhs); } /************** * bitwise_or * **************/ template XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return vorrq_u64(lhs, rhs); } /*************** * bitwise_xor * ***************/ template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return veorq_u64(lhs, rhs); } /******* * neq * *******/ template XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return bitwise_xor(lhs, rhs, A {}); } /*************** * bitwise_not * ***************/ template XSIMD_INLINE batch bitwise_not(batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs))); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& rhs, requires_arch) noexcept { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(rhs))); } /****************** * bitwise_andnot * ******************/ template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs), vreinterpretq_u64_f64(rhs))); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool 
const& rhs, requires_arch) noexcept { return vbicq_u64(lhs, rhs); } /******* * min * *******/ template XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vminq_f64(lhs, rhs); } /******* * max * *******/ template XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vmaxq_f64(lhs, rhs); } /******** * mask * ********/ template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { // From https://github.com/DLTcollab/sse2neon/blob/master/sse2neon.h // Extract most significant bit uint8x16_t msbs = vshrq_n_u8(self, 7); // Position it appropriately static constexpr int8_t shift_table[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 }; int8x16_t shifts = vld1q_s8(shift_table); uint8x16_t positioned = vshlq_u8(msbs, shifts); // Horizontal reduction return vaddv_u8(vget_low_u8(positioned)) | (vaddv_u8(vget_high_u8(positioned)) << 8); } template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { // Extract most significant bit uint16x8_t msbs = vshrq_n_u16(self, 15); // Position it appropriately static constexpr int16_t shift_table[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; int16x8_t shifts = vld1q_s16(shift_table); uint16x8_t positioned = vshlq_u16(msbs, shifts); // Horizontal reduction return vaddvq_u16(positioned); } template = 0> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { // Extract most significant bit uint32x4_t msbs = vshrq_n_u32(self, 31); // Position it appropriately static constexpr int32_t shift_table[4] = { 0, 1, 2, 3 }; int32x4_t shifts = vld1q_s32(shift_table); uint32x4_t positioned = vshlq_u32(msbs, shifts); // Horizontal reduction return vaddvq_u32(positioned); } /********* * count * *********/ template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { return vaddvq_u8(vshrq_n_u8(self, 7)); } template = 0> XSIMD_INLINE size_t count(batch_bool const& self, 
requires_arch) noexcept { return vaddvq_u16(vshrq_n_u16(self, 15)); } template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { return vaddvq_u32(vshrq_n_u32(self, 31)); } template = 0> XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { return vaddvq_u64(vshrq_n_u64(self, 63)); } /******* * abs * *******/ template = 0> XSIMD_INLINE batch abs(batch const& rhs, requires_arch) noexcept { return rhs; } template = 0> XSIMD_INLINE batch abs(batch const& rhs, requires_arch) noexcept { return vabsq_s64(rhs); } template XSIMD_INLINE batch abs(batch const& rhs, requires_arch) noexcept { return vabsq_f64(rhs); } template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return vcvtnq_s32_f32(self); } #if !defined(__GNUC__) template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return vcvtnq_s64_f64(self); } #endif /************** * reciprocal * **************/ template XSIMD_INLINE batch reciprocal(const batch& x, kernel::requires_arch) noexcept { return vrecpeq_f64(x); } /******** * rsqrt * ********/ template XSIMD_INLINE batch rsqrt(batch const& rhs, requires_arch) noexcept { return vrsqrteq_f64(rhs); } /******** * sqrt * ********/ template XSIMD_INLINE batch sqrt(batch const& rhs, requires_arch) noexcept { return vsqrtq_f64(rhs); } /******************** * Fused operations * ********************/ #ifdef __ARM_FEATURE_FMA template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f64(z, x, y); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vfmaq_f64(-z, x, y); } #endif /********* * haddp * *********/ template XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept { return vpaddq_f64(row[0], row[1]); } /********** * insert * **********/ template XSIMD_INLINE batch insert(batch const& self, double val, index, 
requires_arch) noexcept { return vsetq_lane_f64(val, self, I); } /************** * reduce_add * **************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. template ::value, int> = 0> XSIMD_INLINE uint8_t x_vaddvq(uint8x16_t a) noexcept { return vaddvq_u8(a); } template ::value, int> = 0> XSIMD_INLINE int8_t x_vaddvq(int8x16_t a) noexcept { return vaddvq_s8(a); } template ::value, int> = 0> XSIMD_INLINE uint16_t x_vaddvq(uint16x8_t a) noexcept { return vaddvq_u16(a); } template ::value, int> = 0> XSIMD_INLINE int16_t x_vaddvq(int16x8_t a) noexcept { return vaddvq_s16(a); } template ::value, int> = 0> XSIMD_INLINE uint32_t x_vaddvq(uint32x4_t a) noexcept { return vaddvq_u32(a); } template ::value, int> = 0> XSIMD_INLINE int32_t x_vaddvq(int32x4_t a) noexcept { return vaddvq_s32(a); } template ::value, int> = 0> XSIMD_INLINE uint64_t x_vaddvq(uint64x2_t a) noexcept { return vaddvq_u64(a); } template ::value, int> = 0> XSIMD_INLINE int64_t x_vaddvq(int64x2_t a) noexcept { return vaddvq_s64(a); } template ::value, int> = 0> XSIMD_INLINE float x_vaddvq(float32x4_t a) noexcept { return vaddvq_f32(a); } template ::value, int> = 0> XSIMD_INLINE double x_vaddvq(float64x2_t a) noexcept { return vaddvq_f64(a); } } template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vaddvq(register_type(arg)); } /************** * reduce_max * **************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8_t x_vmaxvq(uint8x16_t a) noexcept { return vmaxvq_u8(a); } template ::value, int> = 0> XSIMD_INLINE int8_t x_vmaxvq(int8x16_t a) noexcept { return vmaxvq_s8(a); } template ::value, int> = 0> XSIMD_INLINE uint16_t x_vmaxvq(uint16x8_t a) noexcept { return vmaxvq_u16(a); } template ::value, int> = 0> XSIMD_INLINE int16_t x_vmaxvq(int16x8_t a) noexcept { return vmaxvq_s16(a); } template ::value, int> = 0> XSIMD_INLINE uint32_t x_vmaxvq(uint32x4_t a) noexcept { return vmaxvq_u32(a); } template ::value, int> = 0> XSIMD_INLINE int32_t x_vmaxvq(int32x4_t a) noexcept { return vmaxvq_s32(a); } template ::value, int> = 0> XSIMD_INLINE float x_vmaxvq(float32x4_t a) noexcept { return vmaxvq_f32(a); } template ::value, int> = 0> XSIMD_INLINE double x_vmaxvq(float64x2_t a) noexcept { return vmaxvq_f64(a); } template ::value, int> = 0> XSIMD_INLINE uint64_t x_vmaxvq(uint64x2_t a) noexcept { return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); } template ::value, int> = 0> XSIMD_INLINE int64_t x_vmaxvq(int64x2_t a) noexcept { return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); } } template = 0> XSIMD_INLINE typename batch::value_type reduce_max(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vmaxvq(register_type(arg)); } /************** * reduce_min * **************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value, int> = 0> XSIMD_INLINE uint8_t x_vminvq(uint8x16_t a) noexcept { return vminvq_u8(a); } template ::value, int> = 0> XSIMD_INLINE int8_t x_vminvq(int8x16_t a) noexcept { return vminvq_s8(a); } template ::value, int> = 0> XSIMD_INLINE uint16_t x_vminvq(uint16x8_t a) noexcept { return vminvq_u16(a); } template ::value, int> = 0> XSIMD_INLINE int16_t x_vminvq(int16x8_t a) noexcept { return vminvq_s16(a); } template ::value, int> = 0> XSIMD_INLINE uint32_t x_vminvq(uint32x4_t a) noexcept { return vminvq_u32(a); } template ::value, int> = 0> XSIMD_INLINE int32_t x_vminvq(int32x4_t a) noexcept { return vminvq_s32(a); } template ::value, int> = 0> XSIMD_INLINE float x_vminvq(float32x4_t a) noexcept { return vminvq_f32(a); } template ::value, int> = 0> XSIMD_INLINE double x_vminvq(float64x2_t a) noexcept { return vminvq_f64(a); } template ::value, int> = 0> XSIMD_INLINE uint64_t x_vminvq(uint64x2_t a) noexcept { return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); } template ::value, int> = 0> XSIMD_INLINE int64_t x_vminvq(int64x2_t a) noexcept { return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); } } template = 0> XSIMD_INLINE typename batch::value_type reduce_min(batch const& arg, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vminvq(register_type(arg)); } /********** * select * **********/ template XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { return vbslq_f64(cond, a, b); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { b... 
}, true_br, false_br, neon64 {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = vzip1q_f64(r0, r1); matrix_begin[1] = vzip2q_f64(r0, r1); } template = 0> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = vzip1q_u64(r0, r1); matrix_begin[1] = vzip2q_u64(r0, r1); } template = 0> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = vzip1q_s64(r0, r1); matrix_begin[1] = vzip2q_s64(r0, r1); } /********** * zip_lo * **********/ template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_u8(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_s8(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_u16(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_s16(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_u32(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_s32(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_u64(lhs, rhs); } 
template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_s64(lhs, rhs); } template XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_f32(lhs, rhs); } template XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip1q_f64(lhs, rhs); } /********** * zip_hi * **********/ template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_u8(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_s8(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_u16(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_s16(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_u32(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_s32(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_u64(lhs, rhs); } template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_s64(lhs, rhs); } template XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_f32(lhs, rhs); } template XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vzip2q_f64(lhs, rhs); } /**************** * extract_pair * ****************/ namespace detail { template XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return vextq_f64(rhs, lhs, I); } else { return 
extract_pair(lhs, rhs, n, std::index_sequence()); } } } template XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(n < size && "index in bounds"); return detail::extract_pair(lhs, rhs, n, std::make_index_sequence()); } /****************** * bitwise_rshift * ******************/ template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, requires_arch) noexcept { return bitwise_rshift(lhs, n, neon {}); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { // Blindly converting to signed since out of bounds shifts are UB anyways assert(detail::shifts_all_positive(rhs)); return vshlq_u64(lhs, vnegq_s64(vreinterpretq_s64_u64(rhs))); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, int n, requires_arch) noexcept { return bitwise_rshift(lhs, n, neon {}); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return vshlq_s64(lhs, vnegq_s64(rhs)); } /**************** * bitwise_cast * ****************/ namespace wrap { // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(uint8x16_t a) noexcept { return vreinterpretq_f64_u8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(int8x16_t a) noexcept { return vreinterpretq_f64_s8(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(uint16x8_t a) noexcept { return vreinterpretq_f64_u16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(int16x8_t a) noexcept { return vreinterpretq_f64_s16(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(uint32x4_t a) noexcept { return vreinterpretq_f64_u32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(int32x4_t a) noexcept { return vreinterpretq_f64_s32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(uint64x2_t a) noexcept { return vreinterpretq_f64_u64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(int64x2_t a) noexcept { return vreinterpretq_f64_s64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(float32x4_t a) noexcept { return vreinterpretq_f64_f32(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float64x2_t x_vreinterpretq(float64x2_t a) noexcept { return a; } // TODO(c++17): Make a single function with if constexpr switch // Templating on the scalar type `T` is required because in some compilers (e.g. MSVC) // the vector types are all aliases of the same type. 
template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint8x16_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u8_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int8x16_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s8_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint16x8_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u16_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int16x8_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s16_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint32x4_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u32_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int32x4_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s32_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE uint64x2_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_u64_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE int64x2_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_s64_f64(a); } template ::value && std::is_same::value, int> = 0> XSIMD_INLINE float32x4_t x_vreinterpretq(float64x2_t a) noexcept { return vreinterpretq_f32_f64(a); } } template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { using register_type = typename batch::register_type; return wrap::x_vreinterpretq>(register_type(arg)); } template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { using src_register_type = typename batch::register_type; return wrap::x_vreinterpretq, double>(src_register_type(arg)); } template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return arg; } /********* * isnan * *********/ template XSIMD_INLINE batch_bool isnan(batch const& arg, 
requires_arch) noexcept { return !(arg == arg); } /**************** * rotate_left * ****************/ template XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { return vextq_f64(a, a, N); } } template struct batch_constant; namespace kernel { /********************* * swizzle (dynamic) * *********************/ template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return vqtbl1q_u8(self, idx); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return vqtbl1q_s8(self, idx); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { using batch_type = batch; using index_type = batch; return vreinterpretq_u16_u8(swizzle(batch_type(vreinterpretq_u8_u16(self)), index_type(vreinterpretq_u8_u16(idx * 0x0202 + 0x0100)), neon64 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), idx, neon64 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { using batch_type = batch; using index_type = batch; return vreinterpretq_u32_u8(swizzle(batch_type(vreinterpretq_u8_u32(self)), index_type(vreinterpretq_u8_u32(idx * 0x04040404 + 0x03020100)), neon64 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), idx, neon64 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { using batch_type = batch; using index_type = batch; return vreinterpretq_u64_u8(swizzle(batch_type(vreinterpretq_u8_u64(self)), index_type(vreinterpretq_u8_u64(idx * 0x0808080808080808ull + 0x0706050403020100ull)), neon64 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), idx, neon64 {})); } template XSIMD_INLINE 
batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), idx, neon64 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), idx, neon64 {})); } /******************** * swizzle (static) * ********************/ namespace detail { using ::xsimd::batch_constant; template struct index_burst_impl; template struct index_burst_impl, batch_constant, std::integer_sequence> { using type = batch_constant; }; template struct index_burst_impl, batch_constant, std::integer_sequence> { using next_input = batch_constant; using next_output = batch_constant; using type = typename index_burst_impl>::type; }; template struct index_burst; template struct index_burst, T> { static constexpr size_t mul = sizeof(Tp) / sizeof(T); using input = batch_constant; using output = batch_constant; using type = typename index_burst_impl>::type; }; template using index_burst_t = typename index_burst::type; template XSIMD_INLINE index_burst_t burst_index(B) { return index_burst_t(); } } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { return vqtbl1q_u8(self, idx.as_batch()); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { return vqtbl1q_s8(self, idx.as_batch()); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_u16_u8(swizzle(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_s16_s8(swizzle(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type 
= batch; return vreinterpretq_u32_u8(swizzle(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_s32_s8(swizzle(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_u64_u8(swizzle(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_s64_s8(swizzle(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_f32_u8(swizzle(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, requires_arch) noexcept { using batch_type = batch; return vreinterpretq_f64_u8(swizzle(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index(idx), A())); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant idx, requires_arch) noexcept { return batch>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant idx, requires_arch) noexcept { return batch>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); } /********* * widen * *********/ template XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept { return { batch(vcvt_f64_f32(vget_low_f32(x))), batch(vcvt_high_f64_f32(x)) }; } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_rvv.hpp000066400000000000000000002164551517435117100241620ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Rivos Inc. * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_RVV_HPP #define XSIMD_RVV_HPP #include #include #include "../config/xsimd_macros.hpp" #include "../types/xsimd_batch_constant.hpp" #include "../types/xsimd_rvv_register.hpp" #include "../types/xsimd_utils.hpp" #include "../utils/xsimd_type_traits.hpp" #include "./xsimd_constants.hpp" // This set of macros allows the synthesis of identifiers using a template and // variable macro arguments. A single template can then be used by multiple // macros, or multiple instances of a macro to define the same logic for // different data types. // // First some logic to paste text together... // #define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_CONCAT(T, then) #define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_CONCAT(S, then) #define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_CONCAT(m1, then) #define XSIMD_RVV_PREFIX(T, S, then) then // // XSIMD_RVV_IDENTIFIER accepts type and size parameters, and a template for // the identifier. The template is a comma-separated list of alternating // literal and parameter segments. Each parameter is appended to XSIMD_RVV_PREFIX to // form a new macro name which decides which parameter should be inserted. // Then a literal segment is inserted after that. Empty literals are used to // join two or more variables together. // #define XSIMD_RVV_IDENTIFIER9(T, S, t, ...) t #define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) 
XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__))) #define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_CONCAT(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__))) // // UNBRACKET and REPARSE force the preprocessor to handle expansion in a // specific order. XSIMD_RVV_UNBRACKET strips the parentheses from the template // (which were necessary to keep the template as a single, named macro // parameter up to this point). XSIMD_RVV_ARG_LIST then forms the new parameter list // to pass to XSIMD_RVV_IDENTIFIER0, with trailing commas to ensure the unrolled // XSIMD_RVV_IDENTIFIER loop runs to completion adding empty strings. // // However XSIMD_RVV_IDENTIFIER0 is not expanded immediately because it does not // match a function-like macro in this pass. XSIMD_RVV_REPARSE forces another // evaluation after the expansion of XSIMD_RVV_ARG_LIST, where XSIMD_RVV_IDENTIFIER0 will // now match as a function-like macro, and the cycle of substitutions and // insertions can begin. // #define XSIMD_RVV_REPARSE(v) (v) #define XSIMD_RVV_UNBRACKET(...) 
__VA_ARGS__ #define XSIMD_RVV_ARG_LIST(T, S, name) (T, S, XSIMD_RVV_UNBRACKET name, , , , , , , , , , , , , , , , , , , , , ) #define XSIMD_RVV_IDENTIFIER(T, S, name) XSIMD_RVV_REPARSE(XSIMD_RVV_IDENTIFIER0 XSIMD_RVV_ARG_LIST(T, S, name)) // // To avoid comma-counting bugs, replace the variable references with macros // which include enough commas to keep proper phase, and then use no commas at // all in the templates. // #define XSIMD_RVV_T , _T, #define XSIMD_RVV_S , _S, #define XSIMD_RVV_M , _M, #define XSIMD_RVV_TSM XSIMD_RVV_T XSIMD_RVV_S XSIMD_RVV_M // XSIMD_RVV_OVERLOAD, below, expands to a head section, a number of body sections // (depending on which types are supported), and a tail section. Different // variants of these sections are implemented with different suffixes on the // three macro names XSIMD_RVV_WRAPPER_HEAD, XSIMD_RVV_WRAPPER, and XSIMD_RVV_WRAPPER_TAIL and // specified as an argument to XSIMD_RVV_OVERLOAD (the empty string is the default, // but still needs an extra comma to hold its place). // // The default XSIMD_RVV_WRAPPER_HEAD provides a class containing convenient names // for the function signature argument(s) to XSIMD_RVV_OVERLOAD. That signature can // also reference the template argument T, because it's a text substitution // into the template. #define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ namespace NAME##_cruft \ { \ template \ struct ctx \ { \ static constexpr size_t width = XSIMD_RVV_BITS; \ static constexpr size_t vl = width / (sizeof(T) * 8); \ using vec = rvv_reg_t; \ using uvec = rvv_reg_t, width>; \ using svec = rvv_reg_t, width>; \ using fvec = rvv_reg_t, width>; \ using bvec = rvv_bool_t; \ using scalar_vec = rvv_reg_t; \ using wide_vec = rvv_reg_t; \ using narrow_vec = rvv_reg_t; \ using type = SIGNATURE; \ }; \ template \ using sig_t = typename ctx::type; \ template \ struct impl \ { \ void operator()() const noexcept {}; \ }; \ template \ using impl_t = impl>; #define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) 
XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) #define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) #define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) #define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) // The body of the wrapper defines a functor (because partial specialisation of // functions is not legal) which forwards its arguments to the named intrinsic // with a few manipulations. In general, vector types are handled as // rvv_reg_t<> and rely on the conversion operators in that class for // compatibility with the intrinsics. // // The function signature is not mentioned here. Instead it's provided in the // tail code as the template argument for which this is a specialisation, which // overcomes the problem of converting a function signature type to an argument // list to pass to another function. // #define XSIMD_RVV_WRAPPER(KEY, CALLEE, ...) \ template \ struct impl \ { \ constexpr Ret operator()(Args... args) const noexcept \ { \ return CALLEE(args..., ctx::vl); \ }; \ }; #define XSIMD_RVV_WRAPPER_NOVL(KEY, CALLEE, ...) \ template \ struct impl \ { \ constexpr Ret operator()(Args... args) const noexcept \ { \ return CALLEE(args...); \ }; \ }; #define XSIMD_RVV_WRAPPER_DROP_1ST(KEY, CALLEE, ...) \ template \ struct impl \ { \ constexpr Ret operator()(First, Args... args) const noexcept \ { \ return CALLEE(args..., ctx::vl); \ }; \ }; #define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS(KEY, CALLEE, SIGNATURE, ...) \ template \ struct impl \ { \ constexpr Ret operator()(First, Args... args) const noexcept \ { \ return CALLEE(__VA_ARGS__, ctx::vl); \ }; \ }; #define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS_NOVL(KEY, CALLEE, SIGNATURE, ...) \ template \ struct impl \ { \ constexpr Ret operator()(First, Args... 
args) const noexcept \ { \ return CALLEE(__VA_ARGS__); \ }; \ }; // This part folds all the above templates down into a single functor instance // with all the different function signatures available under the one name. // Not all of the base classes necessarily contain useful code, but there's a // default implementation so that filtering them out isn't really necessary. #define XSIMD_RVV_WRAPPER_TAIL(NAME, ...) \ } /* namespace NAME##_cruft */ \ static constexpr struct : NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t, \ NAME##_cruft::impl_t \ { \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ using NAME##_cruft::impl_t::operator(); \ } NAME {}; #define XSIMD_RVV_WRAPPER_TAIL_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) #define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) #define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) #define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) // clang-format off #define XSIMD_RVV_OVERLOAD_head(my_name, variant, ...) \ XSIMD_RVV_WRAPPER_HEAD##variant(my_name, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_i(name, variant, ...) 
\ XSIMD_RVV_WRAPPER##variant(int8_t, XSIMD_RVV_IDENTIFIER(i, 8, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(int16_t, XSIMD_RVV_IDENTIFIER(i, 16, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(int32_t, XSIMD_RVV_IDENTIFIER(i, 32, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(int64_t, XSIMD_RVV_IDENTIFIER(i, 64, name), __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_u(name, variant, ...) \ XSIMD_RVV_WRAPPER##variant(uint8_t, XSIMD_RVV_IDENTIFIER(u, 8, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(uint16_t, XSIMD_RVV_IDENTIFIER(u, 16, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(uint32_t, XSIMD_RVV_IDENTIFIER(u, 32, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(uint64_t, XSIMD_RVV_IDENTIFIER(u, 64, name), __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_f(name, variant, ...) \ XSIMD_RVV_WRAPPER##variant(float, XSIMD_RVV_IDENTIFIER(f, 32, name), __VA_ARGS__) \ XSIMD_RVV_WRAPPER##variant(double, XSIMD_RVV_IDENTIFIER(f, 64, name), __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_tail(my_name, variant, ...) \ XSIMD_RVV_WRAPPER_TAIL##variant(my_name, __VA_ARGS__) // Use these to create function (actually functor, sorry) wrappers overloaded // for whichever types are supported. Being functors means they can't take a // template argument (until C++14), so if a type can't be deduced then a junk // value can be passed as the first argument and discarded by using the // _DROP_1ST variant, instead. // // The wrappers use the rvv_reg_t<> types for template accessibility, and // because some types (eg., vfloat64mf2_t) don't exist and need extra // abstraction to emulate. // // In many cases the intrinsic names are different for signed, unsigned, or // float variants, the macros OVERLOAD2 and OVERLOAD3 (depending on whether or // not a float variant exists) take multiple intrinsic names and bring them // together under a single overloaded identifier where they can be used within // templates. // #define XSIMD_RVV_OVERLOAD2(my_name, name_i, name_u, variant, ...) 
\ XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD3(my_name, name_i, name_u, name_f, variant, ...) \ XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_f(name_f, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD(my_name, name, ...) XSIMD_RVV_OVERLOAD3(my_name, name, name, name, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_INTS(my_name, name, ...) XSIMD_RVV_OVERLOAD2(my_name, name, name, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_SINTS(my_name, name, variant, ...) \ XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_i(name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_UINTS(my_name, name, variant, ...) \ XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_u(name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) #define XSIMD_RVV_OVERLOAD_FLOATS(my_name, name, variant, ...) 
\ XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_f(name, variant, __VA_ARGS__) \ XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) // clang-format on namespace xsimd { template struct batch_constant; namespace kernel { namespace detail_rvv { template using rvv_reg_t = types::detail::rvv_reg_t; template using rvv_bool_t = types::detail::rvv_bool_t; template struct as_float_relaxed { using type = xsimd::sized_fp_t; }; template <> struct as_float_relaxed<1> { using type = int8_t; }; template <> struct as_float_relaxed<2> { using type = int16_t; }; template using as_float_relaxed_t = typename as_float_relaxed::type; template XSIMD_INLINE rvv_reg_t rvvreinterpret(U const& arg) noexcept { return rvv_reg_t(arg, types::detail::XSIMD_RVV_BITCAST); } template XSIMD_INLINE rvv_reg_t rvvreinterpret(batch const& arg) noexcept { typename batch::register_type r = arg; return rvvreinterpret(r); } template > XSIMD_INLINE batch rvv_to_unsigned_batch(batch const& arg) noexcept { return rvvreinterpret(arg.data); } XSIMD_RVV_OVERLOAD(rvvid, (__riscv_vid_v_u XSIMD_RVV_S XSIMD_RVV_M), _DROP_1ST, uvec(T)) XSIMD_RVV_OVERLOAD3(rvvmv_splat, (__riscv_vmv_v_x_ XSIMD_RVV_TSM), (__riscv_vmv_v_x_ XSIMD_RVV_TSM), (__riscv_vfmv_v_f_ XSIMD_RVV_TSM), , vec(T)) XSIMD_RVV_OVERLOAD3(rvvmv_lane0, (__riscv_vmv_x), (__riscv_vmv_x), (__riscv_vfmv_f), _NOVL, T(vec)) XSIMD_RVV_OVERLOAD(rvvmerge, (__riscv_vmerge), , vec(vec, vec, bvec)) XSIMD_RVV_OVERLOAD3(rvvmerge_splat, (__riscv_vmerge), (__riscv_vmerge), (__riscv_vfmerge), , vec(vec, T, bvec)) // count active lanes in a predicate XSIMD_RVV_OVERLOAD(rvvcpop, (__riscv_vcpop), , size_t(bvec)); template XSIMD_INLINE rvv_bool_t pmask8(uint8_t mask) noexcept { return rvv_bool_t(mask); } template XSIMD_INLINE rvv_bool_t pmask(uint64_t mask) noexcept { return rvv_bool_t(mask); } template XSIMD_INLINE rvv_reg_t vindex() noexcept { auto index = rvvid(T {}); if (shift < 0) index = __riscv_vsrl(index, -shift, batch::size); else index = 
__riscv_vsll(index, shift, batch::size); return __riscv_vadd(index, T(offset), batch::size); } } // namespace detail /******************** * Scalar to vector * ********************/ namespace detail_rvv { template XSIMD_INLINE rvv_reg_t broadcast(T arg) noexcept { // A bit of a dance, here, because rvvmv_splat has no other // argument from which to deduce type, and T=char is not // supported. map_to_sized_type_t arg_not_char(arg); const auto splat = rvvmv_splat(arg_not_char); return rvv_reg_t(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST); } } // broadcast template XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return detail_rvv::broadcast(arg); } /********* * Load * *********/ namespace detail_rvv { XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) } template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return detail_rvv::rvvle(reinterpret_cast const*>(src)); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return load_aligned(src, convert(), rvv {}); } // load_complex namespace detail_rvv { template = types::detail::rvv_width_m1, int> = 0> XSIMD_INLINE rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept { typename rvv_reg_t::register_type tmp; tmp = __riscv_vset(tmp, 0, lo); return __riscv_vset(tmp, 1, hi); } template = 0> XSIMD_INLINE rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept { return __riscv_vslideup(lo, hi, lo.vl, lo.vl * 2); } XSIMD_RVV_OVERLOAD(rvvget_lo_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 0) XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1) template = types::detail::rvv_width_m1, int> = 0> XSIMD_INLINE rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept { 
typename rvv_reg_t::register_type tmp = rvvget_lo_(T {}, vv); return tmp; } template = types::detail::rvv_width_m1, int> = 0> XSIMD_INLINE rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = rvvget_hi_(T {}, vv); return tmp; } template = 0> XSIMD_INLINE rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept { typename rvv_reg_t::register_type tmp = vv; return tmp; } template = 0> XSIMD_INLINE rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept { return __riscv_vslidedown(vv, vv.vl / 2, vv.vl); } } // Must be in detail::load_complex for use by common memory. // ODR violation are prevented because the size of the register is encoded // in batch. namespace detail { template = 0> XSIMD_INLINE batch, A> load_complex(batch const& lo, batch const& hi, requires_arch) noexcept { const auto real_index = detail_rvv::vindex, 0, 1>(); const auto imag_index = detail_rvv::vindex, 1, 1>(); const auto index = detail_rvv::rvvabut, A::width>(real_index, imag_index); const auto input = detail_rvv::rvvabut(lo.data, hi.data); const detail_rvv::rvv_reg_t result = __riscv_vrgather(input, index, index.vl); return { detail_rvv::rvvget_lo(result), detail_rvv::rvvget_hi(result) }; } } /********* * Store * *********/ template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { detail_rvv::rvvse(reinterpret_cast*>(dst), src); } template = 0> XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept { store_aligned(dst, src, rvv {}); } /****************** * scatter/gather * ******************/ namespace detail_rvv { template using rvv_enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; XSIMD_RVV_OVERLOAD(rvvloxei, (__riscv_vloxei XSIMD_RVV_S), , vec(T const*, uvec)) XSIMD_RVV_OVERLOAD(rvvsoxei, (__riscv_vsoxei XSIMD_RVV_S), , void(T*, uvec, vec)) XSIMD_RVV_OVERLOAD3(rvvmul_splat, (__riscv_vmul), (__riscv_vmul), (__riscv_vfmul), , vec(vec, T)) } // 
scatter template = 0> XSIMD_INLINE void scatter(batch const& vals, T* dst, batch const& index, kernel::requires_arch) noexcept { using UU = as_unsigned_integer_t; const auto uindex = detail_rvv::rvv_to_unsigned_batch(index); auto* base = reinterpret_cast*>(dst); // or rvvsuxei const auto bi = detail_rvv::rvvmul_splat(uindex, sizeof(T)); detail_rvv::rvvsoxei(base, bi, vals); } // gather template = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { using UU = as_unsigned_integer_t; const auto uindex = detail_rvv::rvv_to_unsigned_batch(index); auto const* base = reinterpret_cast const*>(src); // or rvvluxei const auto bi = detail_rvv::rvvmul_splat(uindex, sizeof(T)); return detail_rvv::rvvloxei(base, bi); } /************** * Arithmetic * **************/ namespace detail_rvv { XSIMD_RVV_OVERLOAD3(rvvadd, (__riscv_vadd), (__riscv_vadd), (__riscv_vfadd), , vec(vec, vec)) XSIMD_RVV_OVERLOAD2(rvvsadd, (__riscv_vsadd), (__riscv_vsaddu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvsub, (__riscv_vsub), (__riscv_vsub), (__riscv_vfsub), , vec(vec, vec)) XSIMD_RVV_OVERLOAD2(rvvssub, (__riscv_vssub), (__riscv_vssubu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD2(rvvaadd, (__riscv_vaadd), (__riscv_vaaddu), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmul, (__riscv_vmul), (__riscv_vmul), (__riscv_vfmul), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvdiv, (__riscv_vdiv), (__riscv_vdivu), (__riscv_vfdiv), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmax, (__riscv_vmax), (__riscv_vmaxu), (__riscv_vfmax), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmin, (__riscv_vmin), (__riscv_vminu), (__riscv_vfmin), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvneg, (__riscv_vneg), (abort), (__riscv_vfneg), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvabs, (__riscv_vfabs), , vec(vec)) XSIMD_RVV_OVERLOAD3(rvvmacc, (__riscv_vmacc), (__riscv_vmacc), (__riscv_vfmacc), , vec(vec, vec, vec)) XSIMD_RVV_OVERLOAD3(rvvnmsac, (__riscv_vnmsac), (__riscv_vnmsac), (__riscv_vfnmsac), , vec(vec, 
vec, vec)) XSIMD_RVV_OVERLOAD3(rvvmadd, (__riscv_vmadd), (__riscv_vmadd), (__riscv_vfmadd), , vec(vec, vec, vec)) XSIMD_RVV_OVERLOAD3(rvvnmsub, (__riscv_vnmsub), (__riscv_vnmsub), (__riscv_vfnmsub), , vec(vec, vec, vec)) #define RISCV_VMSXX(XX) \ XSIMD_RVV_OVERLOAD3(rvvms##XX, \ (__riscv_vms##XX), \ (__riscv_vms##XX##u), \ (__riscv_vmf##XX), , bvec(vec, vec)) \ XSIMD_RVV_OVERLOAD3(rvvms##XX##_splat, \ (__riscv_vms##XX), \ (__riscv_vms##XX##u), \ (__riscv_vmf##XX), , bvec(vec, T)) #define __riscv_vmsequ __riscv_vmseq #define __riscv_vmsneu __riscv_vmsne RISCV_VMSXX(eq) RISCV_VMSXX(ne) RISCV_VMSXX(lt) RISCV_VMSXX(le) RISCV_VMSXX(gt) RISCV_VMSXX(ge) #undef __riscv_vmsequ #undef __riscv_vmsneu #undef RISCV_VMSXX } // namespace detail // add template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvadd(lhs, rhs); } // sadd template = 0> XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvsadd(lhs, rhs); } // sub template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvsub(lhs, rhs); } // ssub template = 0> XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvssub(lhs, rhs); } // mul template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmul(lhs, rhs); } // div template = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvdiv(lhs, rhs); } // max template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmax(lhs, rhs); } // min template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmin(lhs, rhs); } // neg template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { using 
S = as_signed_integer_t; const auto as_signed = detail_rvv::rvvreinterpret(arg); const auto result = detail_rvv::rvvneg(as_signed); return detail_rvv::rvvreinterpret(result); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { return detail_rvv::rvvneg(arg); } // abs template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { return arg; } template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { return detail_rvv::rvvabs(arg); } // fma: x * y + z template = 0> XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also detail_rvv::rvvmadd(x, y, z); return detail_rvv::rvvmacc(z, x, y); } // fnma: z - x * y template = 0> XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also detail_rvv::rvvnmsub(x, y, z); return detail_rvv::rvvnmsac(z, x, y); } // fms: x * y - z template = 0> XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also vfmsac(z, x, y), but lacking integer version // also vfmsub(x, y, z), but lacking integer version return -fnma(x, y, z); } // fnms: - x * y - z template = 0> XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { // also vfnmacc(z, x, y), but lacking integer version // also vfnmadd(x, y, z), but lacking integer version return -fma(z, x, y); } /********************** * Logical operations * **********************/ namespace detail_rvv { XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvor_splat, (__riscv_vor), , vec(vec, T)) XSIMD_RVV_OVERLOAD_INTS(rvvxor, (__riscv_vxor), , vec(vec, vec)) XSIMD_RVV_OVERLOAD_INTS(rvvnot, (__riscv_vnot), , vec(vec)) XSIMD_RVV_OVERLOAD(rvvmand, (__riscv_vmand_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmor, (__riscv_vmor_mm_b 
XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmxor, (__riscv_vmxor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmandn, (__riscv_vmandn_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) XSIMD_RVV_OVERLOAD(rvvmnot, (__riscv_vmnot), , bvec(bvec)) } // bitwise_and template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvand(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); const auto result_bits = detail_rvv::rvvand(lhs_bits, rhs_bits); return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail_rvv::rvvmand(lhs, rhs); } // bitwise_andnot template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto not_rhs = detail_rvv::rvvnot(rhs); return detail_rvv::rvvand(lhs, not_rhs); } template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); const auto not_rhs = detail_rvv::rvvnot(rhs_bits); const auto result_bits = detail_rvv::rvvand(lhs_bits, not_rhs); return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail_rvv::rvvmandn(lhs, rhs); } // bitwise_or template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvor(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = 
detail_rvv::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); const auto result_bits = detail_rvv::rvvor(lhs_bits, rhs_bits); return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail_rvv::rvvmor(lhs, rhs); } // bitwise_xor template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvxor(lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = detail_rvv::rvv_to_unsigned_batch(lhs); const auto rhs_bits = detail_rvv::rvv_to_unsigned_batch(rhs); const auto result_bits = detail_rvv::rvvxor(lhs_bits, rhs_bits); return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail_rvv::rvvmxor(lhs, rhs); } // bitwise_not template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { return detail_rvv::rvvnot(arg); } template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { const auto arg_bits = detail_rvv::rvv_to_unsigned_batch(arg); const auto result_bits = detail_rvv::rvvnot(arg_bits); return detail_rvv::rvvreinterpret(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { return detail_rvv::rvvmnot(arg); } /********** * Shifts * **********/ namespace detail_rvv { XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t)) XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec)) XSIMD_RVV_OVERLOAD2(rvvsr_splat, (__riscv_vsra), (__riscv_vsrl), , vec(vec, size_t)) XSIMD_RVV_OVERLOAD2(rvvsr, (__riscv_vsra), (__riscv_vsrl), , vec(vec, uvec)) } // namespace detail // bitwise_lshift template = 0> XSIMD_INLINE 
batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept { constexpr size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return detail_rvv::rvvsll_splat(arg, n); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvsll(lhs, detail_rvv::rvv_to_unsigned_batch(rhs)); } // bitwise_rshift template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept { constexpr size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return detail_rvv::rvvsr_splat(arg, n); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvsr(lhs, detail_rvv::rvv_to_unsigned_batch(rhs)); } /************** * Reductions * **************/ namespace detail_rvv { XSIMD_RVV_OVERLOAD3(rvvredsum, (__riscv_vredsum), (__riscv_vredsum), (__riscv_vfredosum), // or __riscv_vfredusum , scalar_vec(vec, scalar_vec)) XSIMD_RVV_OVERLOAD3(rvvredmax, (__riscv_vredmax), (__riscv_vredmaxu), (__riscv_vfredmax), , scalar_vec(vec, scalar_vec)) XSIMD_RVV_OVERLOAD3(rvvredmin, (__riscv_vredmin), (__riscv_vredminu), (__riscv_vfredmin), , scalar_vec(vec, scalar_vec)) XSIMD_RVV_OVERLOAD3(rvvslide1up, (__riscv_vslide1up), (__riscv_vslide1up), (__riscv_vfslide1up), , vec(vec, vec)) XSIMD_RVV_OVERLOAD3(rvvslide1down, (__riscv_vslide1down), (__riscv_vslide1down), (__riscv_vfslide1down), , vec(vec, T)) template XSIMD_INLINE T reduce_scalar(rvv_reg_t const& arg) { return detail_rvv::rvvmv_lane0(rvv_reg_t(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST)); } } // reduce_add template ::value_type, detail::enable_arithmetic_t = 0> XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept { const auto zero = detail_rvv::broadcast(T(0)); const auto r = detail_rvv::rvvredsum(arg, zero); return 
detail_rvv::reduce_scalar(r); } // reduce_max template = 0> XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept { const auto lowest = detail_rvv::broadcast(std::numeric_limits::lowest()); const auto r = detail_rvv::rvvredmax(arg, lowest); return detail_rvv::reduce_scalar(r); } // reduce_min template = 0> XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept { const auto max = detail_rvv::broadcast(std::numeric_limits::max()); const auto r = detail_rvv::rvvredmin(arg, max); return detail_rvv::reduce_scalar(r); } // haddp template = 0> XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept { constexpr std::size_t size = batch::size; T sums[size]; #pragma unroll size for (std::size_t i = 0; i < size; ++i) { sums[i] = reduce_add(row[i], rvv {}); } return load_aligned(sums, convert(), rvv {}); } /*************** * Comparisons * ***************/ // eq template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmseq(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { const auto neq_result = detail_rvv::rvvmxor(lhs, rhs); return detail_rvv::rvvmnot(neq_result); } // neq template = 0> XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmsne(lhs, rhs); } template = 0> XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return detail_rvv::rvvmxor(lhs, rhs); } // lt template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmslt(lhs, rhs); } // le template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmsle(lhs, rhs); } // gt template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmsgt(lhs, 
rhs); } // ge template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return detail_rvv::rvvmsge(lhs, rhs); } /************* * Selection * *************/ namespace detail_rvv { XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress_tu), , vec(vec, vec, bvec)) } // compress template XSIMD_INLINE batch compress(batch const& x, batch_bool const& mask, requires_arch) noexcept { auto zero = broadcast(T(0), rvv {}); return detail_rvv::rvvcompress(zero, x, mask); } /*************** * Permutation * ***************/ namespace detail_rvv { XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec)) XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t)) XSIMD_RVV_OVERLOAD(rvvslidedown, (__riscv_vslidedown), , vec(vec, size_t)) } // swizzle template XSIMD_INLINE batch swizzle(batch const& arg, batch_constant, requires_arch) noexcept { static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); const batch indices { idx... }; return detail_rvv::rvvrgather(arg, indices); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant, requires_arch) noexcept { const auto real = swizzle(self.real(), batch_constant {}, rvv {}); const auto imag = swizzle(self.imag(), batch_constant {}, rvv {}); return batch>(real, imag); } /************* * Selection * *************/ // extract_pair template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, size_t n, requires_arch) noexcept { const auto tmp = detail_rvv::rvvslidedown(rhs, n); return detail_rvv::rvvslideup(tmp, lhs, lhs.size - n); } // select template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { return detail_rvv::rvvmerge(b, a, cond); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { b... 
}, true_br, false_br, rvv {}); } // zip_lo template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto index = detail_rvv::vindex, 0, -1>(); const auto mask = detail_rvv::pmask8(0xaa); return detail_rvv::rvvmerge(detail_rvv::rvvrgather(lhs, index), detail_rvv::rvvrgather(rhs, index), mask); } // zip_hi template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto index = detail_rvv::vindex, batch::size / 2, -1>(); const auto mask = detail_rvv::pmask8(0xaa); return detail_rvv::rvvmerge(detail_rvv::rvvrgather(lhs, index), detail_rvv::rvvrgather(rhs, index), mask); } // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { const auto lo = zip_lo(src.real(), src.imag()); const auto hi = zip_hi(src.real(), src.imag()); T* buf = reinterpret_cast(dst); store_aligned(buf, lo, rvv {}); store_aligned(buf + lo.size, hi, rvv {}); } template = 0> XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { store_complex_aligned(dst, src, rvv {}); } /***************************** * Floating-point arithmetic * *****************************/ namespace detail_rvv { XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec)) XSIMD_RVV_OVERLOAD_FLOATS(rvvfrsqrt7, (__riscv_vfrsqrt7), , vec(vec)) } // rsqrt template = 0> XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept { auto approx = detail_rvv::rvvfrsqrt7(arg); approx = approx * (1.5 - (0.5 * arg * approx * approx)); return approx; } // sqrt template = 0> XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept { return detail_rvv::rvvfsqrt(arg); } // reciprocal template = 0> XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept { return detail_rvv::rvvfrec7(arg); } 
/****************************** * Floating-point conversions * ******************************/ // fast_cast namespace detail_rvv { XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C. (__riscv_vfcvt_rtz_x), (__riscv_vfcvt_rtz_xu), _DROP_1ST, vec(T, fvec)) XSIMD_RVV_OVERLOAD2(rvvfcvt_rne, // round to nearest, ties to even (__riscv_vfcvt_x), (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RNE) XSIMD_RVV_OVERLOAD2(rvvfcvt_rmm, // round to nearest, ties to max magnitude (__riscv_vfcvt_x), (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RMM) XSIMD_RVV_OVERLOAD2(rvvfcvt, // round to current rounding mode. (__riscv_vfcvt_x), (__riscv_vfcvt_xu), _DROP_1ST, vec(T, fvec)) XSIMD_RVV_OVERLOAD_INTS(rvvfcvt_f, (__riscv_vfcvt_f), , fvec(vec)) template using rvv_enable_ftoi_t = std::enable_if_t<(sizeof(T) == sizeof(U) && std::is_floating_point::value && !std::is_floating_point::value), int>; template using rvv_enable_itof_t = std::enable_if_t<(sizeof(T) == sizeof(U) && !std::is_floating_point::value && std::is_floating_point::value), int>; template = 0> XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return rvvfcvt_rtz(U {}, arg); } template = 0> XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return rvvfcvt_f(arg); } } /********* * Miscs * *********/ // set template XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept { const std::array::size> tmp { args... }; return load_unaligned(tmp.data(), convert(), rvv {}); } template XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, Args... args_complex) noexcept { return batch>(set(batch {}, rvv {}, args_complex.real()...), set(batch {}, rvv {}, args_complex.imag()...)); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... 
args) noexcept { using U = as_unsigned_integer_t; const auto values = set(batch {}, rvv {}, static_cast(args)...); const auto zero = broadcast(U(0), rvv {}); detail_rvv::rvv_bool_t result = detail_rvv::rvvmsne(values, zero); return result; } // first template = 0> XSIMD_INLINE T first(batch const& arg, requires_arch) noexcept { return detail_rvv::rvvmv_lane0(arg); } template = 0> XSIMD_INLINE std::complex first(batch, A> const& arg, requires_arch) noexcept { return std::complex { detail_rvv::rvvmv_lane0(arg.real()), detail_rvv::rvvmv_lane0(arg.imag()) }; } // insert template = 0> XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept { const auto mask = detail_rvv::pmask(uint64_t(1) << I); return detail_rvv::rvvmerge_splat(arg, val, mask); } // get template = 0> XSIMD_INLINE T get(batch const& arg, size_t i, requires_arch) noexcept { const auto tmp = detail_rvv::rvvslidedown(arg, i); return detail_rvv::rvvmv_lane0(tmp); } template = 0> XSIMD_INLINE std::complex get(batch, A> const& arg, size_t i, requires_arch) noexcept { const auto tmpr = detail_rvv::rvvslidedown(arg.real(), i); const auto tmpi = detail_rvv::rvvslidedown(arg.imag(), i); return std::complex { detail_rvv::rvvmv_lane0(tmpr), detail_rvv::rvvmv_lane0(tmpi) }; } // get (compile-time index): skip the slidedown when I == 0; lane 0 maps straight to the scalar move. 
template = 0> XSIMD_INLINE T get(batch const& arg, index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return detail_rvv::rvvmv_lane0(arg); } return get(arg, I, rvv {}); } template = 0> XSIMD_INLINE std::complex get(batch, A> const& arg, index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return std::complex { detail_rvv::rvvmv_lane0(arg.real()), detail_rvv::rvvmv_lane0(arg.imag()) }; } return get(arg, I, rvv {}); } // all template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return detail_rvv::rvvcpop(arg) == batch_bool::size; } // any template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return detail_rvv::rvvcpop(arg) > 0; } // bitwise_cast template = 0, detail::enable_arithmetic_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return detail_rvv::rvv_reg_t(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST); } // batch_bool_cast template = 0> XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept { using intermediate_t = typename detail_rvv::rvv_bool_t; return intermediate_t(arg.data); } // from_bool template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { const auto zero = broadcast(T(0), rvv {}); return detail_rvv::rvvmerge_splat(zero, T(1), arg); } namespace detail_rvv { template XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { return __riscv_vslidedown(arg, i, types::detail::rvv_width_m1 / 8); } template <> XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { const auto bytes = __riscv_vlmul_trunc_u8mf2(arg); const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf2 / 8); return __riscv_vlmul_ext_u8m1(result); } template <> XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { const auto bytes = __riscv_vlmul_trunc_u8mf4(arg); const auto result = 
__riscv_vslidedown(bytes, i, types::detail::rvv_width_mf4 / 8); return __riscv_vlmul_ext_u8m1(result); } template <> XSIMD_INLINE vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) { const auto bytes = __riscv_vlmul_trunc_u8mf8(arg); const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf8 / 8); return __riscv_vlmul_ext_u8m1(result); } } // slide_left template = 0> XSIMD_INLINE batch slide_left(batch const& arg, requires_arch) noexcept { const auto zero = broadcast(uint8_t(0), rvv {}); const auto bytes = arg.data.get_bytes(); return detail_rvv::rvvreinterpret(detail_rvv::rvvslideup(zero, bytes, N)); } // slide_right template = 0> XSIMD_INLINE batch slide_right(batch const& arg, requires_arch) noexcept { using reg_t = detail_rvv::rvv_reg_t; const auto bytes = arg.data.get_bytes(); return reg_t(detail_rvv::rvvslidedownbytes(bytes, N), types::detail::XSIMD_RVV_BITCAST); } // isnan template = 0> XSIMD_INLINE batch_bool isnan(batch const& arg, requires_arch) noexcept { return !(arg == arg); } namespace detail_rvv { template using rvv_as_signed_integer_t = as_signed_integer_t>; template > XSIMD_INLINE batch rvvfcvt_default(batch const& arg) noexcept { return rvvfcvt_rne(U {}, arg); } template > XSIMD_INLINE batch rvvfcvt_afz(batch const& arg) noexcept { return rvvfcvt_rmm(U {}, arg); } } // nearbyint_as_int template > XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { // Reference rounds ties to nearest even return detail_rvv::rvvfcvt_default(arg); } // round template = 0> XSIMD_INLINE batch round(batch const& arg, requires_arch) noexcept { // Round ties away from zero. const auto mask = abs(arg) < constants::maxflint>(); return select(mask, to_float(detail_rvv::rvvfcvt_afz(arg)), arg, rvv {}); } // nearbyint template = 0> XSIMD_INLINE batch nearbyint(batch const& arg, requires_arch) noexcept { // Round according to current rounding mode. 
const auto mask = abs(arg) < constants::maxflint>(); return select(mask, to_float(detail_rvv::rvvfcvt_default(arg)), arg, rvv {}); } // mask template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept; template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR((8 * sizeof(T)) >= batch_bool::size) { // (A) Easy case: the number of slots fits in T. const auto zero = detail_rvv::broadcast, types::detail::rvv_width_m1>(T(0)); auto ones = detail_rvv::broadcast, A::width>(1); auto iota = detail_rvv::rvvid(as_unsigned_integer_t {}); auto upowers = detail_rvv::rvvsll(ones, iota); auto r = __riscv_vredor(self.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool::size); return detail_rvv::reduce_scalar>(r); } else XSIMD_IF_CONSTEXPR((2 * 8 * sizeof(T)) == batch_bool::size) { // (B) We need two rounds, one for the low part, one for the high part. struct LowerHalf { static constexpr bool get(unsigned i, unsigned n) { return i < n / 2; } }; struct UpperHalf { static constexpr bool get(unsigned i, unsigned n) { return i >= n / 2; } }; // The low part is similar to the approach in (A). const auto zero = detail_rvv::broadcast, types::detail::rvv_width_m1>(T(0)); auto ones = detail_rvv::broadcast, A::width>(1); auto iota = detail_rvv::rvvid(as_unsigned_integer_t {}); auto upowers = detail_rvv::rvvsll(ones, iota); auto low_mask = self & make_batch_bool_constant(); auto r_low = __riscv_vredor(low_mask.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool::size); // The high part requires to slide the upower filter to match the high mask. upowers = detail_rvv::rvvslideup(upowers, upowers, 8 * sizeof(T)); auto high_mask = self & make_batch_bool_constant(); auto r_high = __riscv_vredor(high_mask.data.as_mask(), upowers, (typename decltype(zero)::register_type)zero, batch_bool::size); // Concatenate the two parts. 
return (uint64_t)detail_rvv::reduce_scalar>(r_low) | ((uint64_t)detail_rvv::reduce_scalar>(r_high) << (8 * sizeof(T))); } else { // (C) we could generalize (B) but we already cover a lot of case now. return mask(self, common {}); } } } // namespace kernel } // namespace xsimd #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_scalar.hpp000066400000000000000000001063611517435117100246040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SCALAR_HPP #define XSIMD_SCALAR_HPP #include #include #include #include #include #include #include #include "xsimd/config/xsimd_macros.hpp" #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #ifdef __APPLE__ #include #endif namespace xsimd { using std::abs; using std::acos; using std::acosh; using std::arg; using std::asin; using std::asinh; using std::atan; using std::atan2; using std::atanh; using std::cbrt; using std::ceil; using std::conj; using std::copysign; using std::cos; using std::cosh; using std::erf; using std::erfc; using std::exp; using std::exp2; using std::expm1; using std::fabs; using std::fdim; using std::floor; using std::fmax; using std::fmin; using std::fmod; using std::hypot; using std::ldexp; using std::lgamma; using std::log; using std::log10; using std::log1p; using std::log2; using std::modf; using std::nearbyint; using std::nextafter; using std::norm; using std::polar; using std::proj; using std::remainder; using std::rint; using std::round; using std::sin; using std::sinh; using std::sqrt; using std::tan; using std::tanh; using std::tgamma; using 
std::trunc; template XSIMD_INLINE constexpr std::enable_if_t::value && std::is_signed::value, T> abs(T v) noexcept { return v < 0 ? -v : v; } template XSIMD_INLINE constexpr std::enable_if_t::value && std::is_unsigned::value, T> abs(T v) noexcept { return v; } #ifndef _WIN32 using std::isfinite; using std::isinf; using std::isnan; #else // Windows defines catch all templates template XSIMD_INLINE std::enable_if_t::value, bool> isfinite(T var) noexcept { return std::isfinite(var); } template XSIMD_INLINE std::enable_if_t::value, bool> isfinite(T var) noexcept { return isfinite(double(var)); } template XSIMD_INLINE std::enable_if_t::value, bool> isinf(T var) noexcept { return std::isinf(var); } template XSIMD_INLINE std::enable_if_t::value, bool> isinf(T var) noexcept { return isinf(double(var)); } template XSIMD_INLINE std::enable_if_t::value, bool> isnan(T var) noexcept { return std::isnan(var); } template XSIMD_INLINE std::enable_if_t::value, bool> isnan(T var) noexcept { return isnan(double(var)); } #endif template XSIMD_INLINE std::common_type_t add(T const& x, Tp const& y) noexcept { return x + y; } template XSIMD_INLINE std::common_type_t avg(T const& x, Tp const& y) noexcept { using common_type = std::common_type_t; if (std::is_floating_point::value) return (x + y) / 2; else if (std::is_unsigned::value) { return (x & y) + ((x ^ y) >> 1); } else { // Inspired by // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c auto t = (x & y) + ((x ^ y) >> 1); auto t_u = static_cast>(t); auto avg = t + (static_cast(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); return avg; } } template XSIMD_INLINE std::common_type_t avgr(T const& x, Tp const& y) noexcept { using common_type = std::common_type_t; if (std::is_floating_point::value) return avg(x, y); else { return avg(x, y) + ((x ^ y) & 1); } } template XSIMD_INLINE T incr(T const& x) noexcept { return x + T(1); } template XSIMD_INLINE T incr_if(T const& x, bool mask) noexcept { return x + 
T(mask ? 1 : 0); } XSIMD_INLINE bool all(bool mask) { return mask; } XSIMD_INLINE bool any(bool mask) { return mask; } XSIMD_INLINE bool none(bool mask) { return !mask; } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_and(T x, T y) noexcept { return x & y; } template XSIMD_INLINE T_out bitwise_cast(T_in x) noexcept { static_assert(sizeof(T_in) == sizeof(T_out), "bitwise_cast between types of the same size"); T_out r; std::memcpy((void*)&r, (void*)&x, sizeof(T_in)); return r; } XSIMD_INLINE float bitwise_and(float x, float y) noexcept { uint32_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(float)); std::memcpy((void*)&iy, (void*)&y, sizeof(float)); uint32_t ir = bitwise_and(ix, iy); float r; std::memcpy((void*)&r, (void*)&ir, sizeof(float)); return r; } XSIMD_INLINE double bitwise_and(double x, double y) noexcept { uint64_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(double)); std::memcpy((void*)&iy, (void*)&y, sizeof(double)); uint64_t ir = bitwise_and(ix, iy); double r; std::memcpy((void*)&r, (void*)&ir, sizeof(double)); return r; } template XSIMD_INLINE std::enable_if_t::value && std::is_integral::value, T0> bitwise_lshift(T0 x, T1 shift) noexcept { return x << shift; } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_lshift(T x) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Count must be less than the number of bits in T"); return x << shift; } template XSIMD_INLINE std::enable_if_t::value && std::is_integral::value, T0> bitwise_rshift(T0 x, T1 shift) noexcept { return x >> shift; } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_rshift(T x) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Count must be less than the number of bits in T"); return x >> shift; } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_not(T x) noexcept { return ~x; } XSIMD_INLINE bool 
bitwise_not(bool x) noexcept { return !x; } XSIMD_INLINE float bitwise_not(float x) noexcept { uint32_t ix; std::memcpy((void*)&ix, (void*)&x, sizeof(float)); uint32_t ir = bitwise_not(ix); float r; std::memcpy((void*)&r, (void*)&ir, sizeof(float)); return r; } XSIMD_INLINE double bitwise_not(double x) noexcept { uint64_t ix; std::memcpy((void*)&ix, (void*)&x, sizeof(double)); uint64_t ir = bitwise_not(ix); double r; std::memcpy((void*)&r, (void*)&ir, sizeof(double)); return r; } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_andnot(T x, T y) noexcept { return bitwise_and(x, bitwise_not(y)); } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_or(T x, T y) noexcept { return x | y; } XSIMD_INLINE float bitwise_or(float x, float y) noexcept { uint32_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(float)); std::memcpy((void*)&iy, (void*)&y, sizeof(float)); uint32_t ir = bitwise_or(ix, iy); float r; std::memcpy((void*)&r, (void*)&ir, sizeof(float)); return r; } XSIMD_INLINE double bitwise_or(double x, double y) noexcept { uint64_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(double)); std::memcpy((void*)&iy, (void*)&y, sizeof(double)); uint64_t ir = bitwise_or(ix, iy); double r; std::memcpy((void*)&r, (void*)&ir, sizeof(double)); return r; } template XSIMD_INLINE std::enable_if_t::value, T> bitwise_xor(T x, T y) noexcept { return x ^ y; } XSIMD_INLINE float bitwise_xor(float x, float y) noexcept { uint32_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(float)); std::memcpy((void*)&iy, (void*)&y, sizeof(float)); uint32_t ir = bitwise_xor(ix, iy); float r; std::memcpy((void*)&r, (void*)&ir, sizeof(float)); return r; } XSIMD_INLINE double bitwise_xor(double x, double y) noexcept { uint64_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(double)); std::memcpy((void*)&iy, (void*)&y, sizeof(double)); uint64_t ir = bitwise_xor(ix, iy); double r; std::memcpy((void*)&r, (void*)&ir, sizeof(double)); return r; } template XSIMD_INLINE 
std::common_type_t div(T const& x, Tp const& y) noexcept { return x / y; } template XSIMD_INLINE auto mod(T const& x, Tp const& y) noexcept { return x % y; } template XSIMD_INLINE std::common_type_t mul(T const& x, Tp const& y) noexcept { return x * y; } template XSIMD_INLINE T neg(T const& x) noexcept { return -x; } template XSIMD_INLINE auto pos(T const& x) noexcept { return +x; } XSIMD_INLINE float reciprocal(float const& x) noexcept { return 1.f / x; } XSIMD_INLINE double reciprocal(double const& x) noexcept { return 1. / x; } template XSIMD_INLINE std::enable_if_t::value && std::is_integral::value, T0> rotl(T0 x, T1 shift) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; return (x << shift) | (x >> (bits - shift)); } template XSIMD_INLINE std::enable_if_t::value, T> rotl(T x) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); return (x << count) | (x >> (bits - count)); } template XSIMD_INLINE std::enable_if_t::value && std::is_integral::value, T0> rotr(T0 x, T1 shift) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; return (x >> shift) | (x << (bits - shift)); } template XSIMD_INLINE std::enable_if_t::value, T> rotr(T x) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(count < bits, "Count must be less than the number of bits in T"); return (x >> count) | (x << (bits - count)); } template XSIMD_INLINE bool isnan(std::complex var) noexcept { return std::isnan(std::real(var)) || std::isnan(std::imag(var)); } template XSIMD_INLINE bool isinf(std::complex var) noexcept { return std::isinf(std::real(var)) || std::isinf(std::imag(var)); } template XSIMD_INLINE bool isfinite(std::complex var) noexcept { return std::isfinite(std::real(var)) && std::isfinite(std::imag(var)); } #ifdef 
XSIMD_ENABLE_XTL_COMPLEX using xtl::abs; using xtl::acos; using xtl::acosh; using xtl::asin; using xtl::asinh; using xtl::atan; using xtl::atanh; using xtl::cos; using xtl::cosh; using xtl::exp; using xtl::log; using xtl::log10; using xtl::norm; using xtl::pow; using xtl::proj; using xtl::sin; using xtl::sinh; using xtl::sqrt; using xtl::tan; using xtl::tanh; #endif template ::value>> XSIMD_INLINE T clip(const T& val, const T& low, const T& hi) noexcept { assert(low <= hi && "ordered clipping bounds"); return low > val ? low : (hi < val ? hi : val); } template ::value>> XSIMD_INLINE bool is_flint(const T& x) noexcept { #ifdef __FAST_MATH__ return (x - std::trunc(x)) == T(0); #else return std::isnan(x - x) ? false : (x - std::trunc(x)) == T(0); #endif } template ::value>> XSIMD_INLINE bool is_even(const T& x) noexcept { return is_flint(x * T(0.5)); } template ::value>> XSIMD_INLINE bool is_odd(const T& x) noexcept { return is_even(x - 1.); } XSIMD_INLINE int32_t nearbyint_as_int(float var) noexcept { return static_cast(std::nearbyint(var)); } XSIMD_INLINE int64_t nearbyint_as_int(double var) noexcept { return static_cast(std::nearbyint(var)); } template ::value>> XSIMD_INLINE bool eq(const T& x0, const T& x1) noexcept { return x0 == x1; } template XSIMD_INLINE bool eq(const std::complex& x0, const std::complex& x1) noexcept { return x0 == x1; } template ::value>> XSIMD_INLINE bool ge(const T& x0, const T& x1) noexcept { return x0 >= x1; } template ::value>> XSIMD_INLINE bool gt(const T& x0, const T& x1) noexcept { return x0 > x1; } template ::value>> XSIMD_INLINE bool le(const T& x0, const T& x1) noexcept { return x0 <= x1; } template ::value>> XSIMD_INLINE bool lt(const T& x0, const T& x1) noexcept { return x0 < x1; } template ::value>> XSIMD_INLINE bool neq(const T& x0, const T& x1) noexcept { return x0 != x1; } template XSIMD_INLINE bool neq(const std::complex& x0, const std::complex& x1) noexcept { return !(x0 == x1); } #if defined(__APPLE__) && 
(MAC_OS_X_VERSION_MIN_REQUIRED > 1080) XSIMD_INLINE float exp10(const float& x) noexcept { return __exp10f(x); } XSIMD_INLINE double exp10(const double& x) noexcept { return __exp10(x); } #elif defined(__GLIBC__) XSIMD_INLINE float exp10(const float& x) noexcept { return ::exp10f(x); } XSIMD_INLINE double exp10(const double& x) noexcept { return ::exp10(x); } #elif !defined(__clang__) && defined(__GNUC__) && (__GNUC__ >= 5) XSIMD_INLINE float exp10(const float& x) noexcept { return __builtin_exp10f(x); } XSIMD_INLINE double exp10(const double& x) noexcept { return __builtin_exp10(x); } #elif defined(_WIN32) template ::value>> XSIMD_INLINE T exp10(const T& x) noexcept { // Very inefficient but other implementations give incorrect results // on Windows return std::pow(T(10), x); } #else XSIMD_INLINE float exp10(const float& x) noexcept { const float ln10 = std::log(10.f); return std::exp(ln10 * x); } XSIMD_INLINE double exp10(const double& x) noexcept { const double ln10 = std::log(10.); return std::exp(ln10 * x); } #endif template ::value>> XSIMD_INLINE auto rsqrt(const T& x) noexcept { using float_type = decltype(std::sqrt(x)); return static_cast(1) / std::sqrt(x); } namespace detail { template XSIMD_INLINE C expm1_complex_scalar_impl(const C& val) noexcept { using T = typename C::value_type; T isin = std::sin(val.imag()); T rem1 = std::expm1(val.real()); T re = rem1 + T(1.); T si = std::sin(val.imag() * T(0.5)); return std::complex(rem1 - T(2.) * re * si * si, re * isin); } } template XSIMD_INLINE std::complex expm1(const std::complex& val) noexcept { return detail::expm1_complex_scalar_impl(val); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex expm1(const xtl::xcomplex& val) noexcept { return detail::expm1_complex_scalar_impl(val); } #endif namespace detail { template XSIMD_INLINE C log1p_complex_scalar_impl(const C& val) noexcept { using T = typename C::value_type; C u = C(1.) + val; return u == C(1.) ? val : (u.real() <= T(0.) ? 
log(u) : log(u) * val / (u - C(1.))); } } template XSIMD_INLINE std::complex log1p(const std::complex& val) noexcept { return detail::log1p_complex_scalar_impl(val); } template XSIMD_INLINE std::complex log2(const std::complex& val) noexcept { return log(val) / std::log(T(2)); } template ::value>> XSIMD_INLINE T sadd(const T& lhs, const T& rhs) noexcept { if (std::numeric_limits::is_signed) { if ((lhs > 0) && (rhs > std::numeric_limits::max() - lhs)) { return std::numeric_limits::max(); } else if ((lhs < 0) && (rhs < std::numeric_limits::lowest() - lhs)) { return std::numeric_limits::lowest(); } else { return lhs + rhs; } } else { if (rhs > std::numeric_limits::max() - lhs) { return std::numeric_limits::max(); } else { return lhs + rhs; } } } template ::value>> XSIMD_INLINE T ssub(const T& lhs, const T& rhs) noexcept { if (std::numeric_limits::is_signed) { return sadd(lhs, (T)-rhs); } else { if (lhs < rhs) { return std::numeric_limits::lowest(); } else { return lhs - rhs; } } } namespace detail { template XSIMD_INLINE std::enable_if_t::value, T0> ipow(const T0& x, const T1& n) noexcept { static_assert(std::is_integral::value, "second argument must be an integer"); T0 a = x; T1 b = n; bool const recip = b < 0; T0 r(static_cast(1)); while (1) { if (b & 1) { r *= a; } b /= 2; if (b == 0) { break; } a *= a; } return recip ? 
static_cast(1) / r : r; } } template XSIMD_INLINE std::enable_if_t::value, T0> pow(const T0& x, const T1& n) noexcept { return detail::ipow(x, n); } template ::value && std::is_floating_point::value>> XSIMD_INLINE auto pow(const T0& t0, const T1& t1) noexcept { return std::pow(t0, t1); } template XSIMD_INLINE std::enable_if_t::value, std::complex> pow(const std::complex& t0, const T1& t1) noexcept { return detail::ipow(t0, t1); } template XSIMD_INLINE std::enable_if_t::value, std::complex> pow(const std::complex& t0, const T1& t1) noexcept { return std::pow(t0, t1); } template ::value>> XSIMD_INLINE auto pow(const T0& t0, const std::complex& t1) noexcept { return std::pow(t0, t1); } template ::value>> XSIMD_INLINE T bitofsign(T const& x) noexcept { return T(x < T(0)); } XSIMD_INLINE float bitofsign(float const& x) noexcept { return float(std::signbit(x)); } XSIMD_INLINE double bitofsign(double const& x) noexcept { return double(std::signbit(x)); } XSIMD_INLINE long double bitofsign(long double const& x) noexcept { return static_cast(std::signbit(x)); } template ::value>> XSIMD_INLINE auto signbit(T const& v) noexcept { return bitofsign(v); } XSIMD_INLINE double sign(bool const& v) noexcept { return v; } template ::value>> XSIMD_INLINE T sign(const T& v) noexcept { return v < T(0) ? T(-1.) : v == T(0) ? T(0.) 
: T(1.); } namespace detail { template XSIMD_INLINE C sign_complex_scalar_impl(const C& v) noexcept { using value_type = typename C::value_type; if (v.real()) { return C(sign(v.real()), value_type(0)); } else { return C(sign(v.imag()), value_type(0)); } } } template XSIMD_INLINE std::complex sign(const std::complex& v) noexcept { return detail::sign_complex_scalar_impl(v); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex sign(const xtl::xcomplex& v) noexcept { return detail::sign_complex_scalar_impl(v); } #endif XSIMD_INLINE double signnz(bool const&) noexcept { return 1; } template ::value>> XSIMD_INLINE T signnz(const T& v) noexcept { return v < T(0) ? T(-1.) : T(1.); } template XSIMD_INLINE std::common_type_t sub(T const& x, Tp const& y) noexcept { return x - y; } template XSIMD_INLINE T decr(T const& x) noexcept { return x - T(1); } template XSIMD_INLINE T decr_if(T const& x, bool mask) noexcept { return x - T(mask ? 1 : 0); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex log2(const xtl::xcomplex& val) noexcept { return log(val) / log(T(2)); } #endif #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex log1p(const xtl::xcomplex& val) noexcept { return detail::log1p_complex_scalar_impl(val); } #endif template ::value && std::is_scalar::value>> XSIMD_INLINE auto min(T0 const& self, T1 const& other) noexcept { return self > other ? other : self; } // numpy defines minimum operator on complex using lexical comparison template XSIMD_INLINE std::complex> min(std::complex const& self, std::complex const& other) noexcept { return (self.real() < other.real()) ? (self) : (self.real() == other.real() ? (self.imag() < other.imag() ? self : other) : other); } template ::value && std::is_scalar::value>> XSIMD_INLINE auto max(T0 const& self, T1 const& other) noexcept { return self < other ? 
other : self; } // numpy defines maximum operator on complex using lexical comparison template XSIMD_INLINE std::complex> max(std::complex const& self, std::complex const& other) noexcept { return (self.real() > other.real()) ? (self) : (self.real() == other.real() ? (self.imag() > other.imag() ? self : other) : other); } template XSIMD_INLINE std::enable_if_t::value, T> fma(const T& a, const T& b, const T& c) noexcept { return a * b + c; } template XSIMD_INLINE std::enable_if_t::value, T> fma(const T& a, const T& b, const T& c) noexcept { return std::fma(a, b, c); } template XSIMD_INLINE std::enable_if_t::value, T> fms(const T& a, const T& b, const T& c) noexcept { return a * b - c; } namespace detail { template XSIMD_INLINE C fma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept { return { fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())), fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) }; } } template XSIMD_INLINE std::complex fma(const std::complex& a, const std::complex& b, const std::complex& c) noexcept { return detail::fma_complex_scalar_impl(a, b, c); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex fma(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) noexcept { return detail::fma_complex_scalar_impl(a, b, c); } #endif namespace detail { template XSIMD_INLINE C fms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept { return { fms(a.real(), b.real(), fma(a.imag(), b.imag(), c.real())), fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) }; } } template XSIMD_INLINE std::complex fms(const std::complex& a, const std::complex& b, const std::complex& c) noexcept { return detail::fms_complex_scalar_impl(a, b, c); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex fms(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) noexcept { return detail::fms_complex_scalar_impl(a, b, c); } #endif template XSIMD_INLINE 
std::enable_if_t::value, T> fnma(const T& a, const T& b, const T& c) noexcept { return -(a * b) + c; } template XSIMD_INLINE std::enable_if_t::value, T> fnma(const T& a, const T& b, const T& c) noexcept { return std::fma(-a, b, c); } namespace detail { template XSIMD_INLINE C fnma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept { return { fms(a.imag(), b.imag(), fms(a.real(), b.real(), c.real())), -fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) }; } } template XSIMD_INLINE std::complex fnma(const std::complex& a, const std::complex& b, const std::complex& c) noexcept { return detail::fnma_complex_scalar_impl(a, b, c); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex fnma(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) noexcept { return detail::fnma_complex_scalar_impl(a, b, c); } #endif template XSIMD_INLINE std::enable_if_t::value, T> fnms(const T& a, const T& b, const T& c) noexcept { return -(a * b) - c; } template XSIMD_INLINE std::enable_if_t::value, T> fnms(const T& a, const T& b, const T& c) noexcept { return -std::fma(a, b, c); } namespace detail { template XSIMD_INLINE C fnms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept { return { fms(a.imag(), b.imag(), fma(a.real(), b.real(), c.real())), -fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) }; } } template XSIMD_INLINE std::complex fnms(const std::complex& a, const std::complex& b, const std::complex& c) noexcept { return detail::fnms_complex_scalar_impl(a, b, c); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE xtl::xcomplex fnms(const xtl::xcomplex& a, const xtl::xcomplex& b, const xtl::xcomplex& c) noexcept { return detail::fnms_complex_scalar_impl(a, b, c); } #endif namespace detail { #define XSIMD_HASSINCOS_TRAIT(func) \ template \ struct has##func \ { \ template \ static XSIMD_INLINE auto get(T* ptr) -> decltype(func(std::declval(), std::declval(), std::declval()), std::true_type {}); \ static 
XSIMD_INLINE std::false_type get(...); \ static constexpr bool value = decltype(get((S*)nullptr))::value; \ } #define XSIMD_HASSINCOS(func, T) has##func::value XSIMD_HASSINCOS_TRAIT(sincos); XSIMD_HASSINCOS_TRAIT(sincosf); XSIMD_HASSINCOS_TRAIT(__sincos); XSIMD_HASSINCOS_TRAIT(__sincosf); struct common_sincosf { template XSIMD_INLINE std::enable_if_t operator()(float val, T& s, T& c) { sincosf(val, &s, &c); } template XSIMD_INLINE std::enable_if_t operator()(float val, T& s, T& c) { __sincosf(val, &s, &c); } template XSIMD_INLINE std::enable_if_t operator()(float val, T& s, T& c) { s = std::sin(val); c = std::cos(val); } }; struct common_sincos { template XSIMD_INLINE std::enable_if_t operator()(double val, T& s, T& c) { sincos(val, &s, &c); } template XSIMD_INLINE std::enable_if_t operator()(double val, T& s, T& c) { __sincos(val, &s, &c); } template XSIMD_INLINE std::enable_if_t operator()(double val, T& s, T& c) { s = std::sin(val); c = std::cos(val); } }; #undef XSIMD_HASSINCOS_TRAIT #undef XSIMD_HASSINCOS } XSIMD_INLINE std::pair sincos(float val) noexcept { float s, c; detail::common_sincosf {}(val, s, c); return std::make_pair(s, c); } XSIMD_INLINE std::pair sincos(double val) noexcept { double s, c; detail::common_sincos {}(val, s, c); return std::make_pair(s, c); } template XSIMD_INLINE std::pair, std::complex> sincos(const std::complex& val) noexcept { return std::make_pair(std::sin(val), std::cos(val)); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE std::pair, xtl::xcomplex> sincos(const xtl::xcomplex& val) noexcept { return std::make_pair(sin(val), cos(val)); } #endif template ::value>> XSIMD_INLINE T frexp(T const& val, int& exp) noexcept { return std::frexp(val, &exp); } template XSIMD_INLINE T select(bool cond, T const& true_br, T const& false_br) noexcept { return cond ? 
true_br : false_br; } template XSIMD_INLINE constexpr bool batch_bool_cast(bool b) noexcept { return b; } template XSIMD_INLINE constexpr T_out batch_cast(T_in const& val) noexcept { static_assert(!std::is_same::value, "cannot convert to bool, use !x or x != 0"); return static_cast(val); } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_sse2.hpp000066400000000000000000003030271517435117100242110ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SSE2_HPP #define XSIMD_SSE2_HPP #include #include #include #include "../types/xsimd_batch_constant.hpp" #include "../types/xsimd_sse2_register.hpp" #include "./utils/shifts.hpp" namespace xsimd { template struct batch_bool_constant; template XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; template struct batch_constant; namespace kernel { using namespace types; namespace detail { constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) { return (z << 6) | (y << 4) | (x << 2) | w; } constexpr uint32_t shuffle(uint32_t x, uint32_t y) { return (y << 1) | x; } constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) { return shuffle(w % 4, x % 4, y % 4, z % 4); } constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x) { return shuffle(w % 2, x % 2); } } // fwd template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; template XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) 
noexcept; template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 return _mm_andnot_pd(sign_mask, self); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 return _mm_andnot_ps(sign_mask, self); } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_add_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_add_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm_add_ps(self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm_add_pd(self, other); } // all template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self) == 0x0F; } template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self) == 0x03; } template ::value>> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_epi8(self) == 0xFFFF; } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self) != 0; } template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self) != 0; } template ::value>> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_epi8(self) != 0; } // avgr template ::value>> 
XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_avg_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_avg_epu16(self, other); } else { return avgr(self, other, common {}); } } // avg template ::value>> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto adj = ((self ^ other) << 7) >> 7; return avgr(self, other, A {}) - adj; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto adj = ((self ^ other) << 15) >> 15; return avgr(self, other, A {}) - adj; } else { return avg(self, other, common {}); } } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm_and_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_and_ps(self, other); } template ::value>> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm_and_si128(self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_and_si128(self, other); } template batch XSIMD_INLINE bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm_and_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_and_pd(self, other); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm_andnot_ps(other, self); } template XSIMD_INLINE batch_bool 
bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_andnot_ps(other, self); } template ::value>> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm_andnot_si128(other, self); } template ::value>> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_andnot_si128(other, self); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm_andnot_pd(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_andnot_pd(other, self); } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_slli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_slli_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Count must be less than the number of bits in T"); XSIMD_IF_CONSTEXPR(shift == 0) { return self; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // 8-bit left shift via 16-bit shift + mask __m128i shifted = _mm_slli_epi16(self, static_cast(shift)); // TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow constexpr uint8_t mask8 = static_cast(sizeof(T) == 1 ? 
(~0u << shift) : 0); const __m128i mask = _mm_set1_epi8(mask8); return _mm_and_si128(shifted, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_slli_epi16(self, static_cast(shift)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_slli_epi32(self, static_cast(shift)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_slli_epi64(self, static_cast(shift)); } return bitwise_lshift(self, common {}); } // bitwise_lshift multiple (constant) // Missing implementations are dispacthed to the `batch` overload in xsimd_api. template = 0> XSIMD_INLINE batch bitwise_lshift( batch const& self, batch_constant shifts, requires_arch req) noexcept { XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) { return bitwise_lshift(self, req); } constexpr auto mults = batch_constant(1u << Vs)...>(); return _mm_mullo_epi16(self, mults.as_batch()); } template = 0> XSIMD_INLINE batch bitwise_lshift( batch const& self, batch_constant shifts, requires_arch req) noexcept { using uint_t = std::make_unsigned_t; XSIMD_IF_CONSTEXPR(utils::all_equals(shifts)) { return bitwise_lshift(self, req); } return bitwise_cast( utils::bitwise_lshift_as_twice_larger( bitwise_cast(self), batch_constant(Vs)...> {})); } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); } template ::value>> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_si128(self, _mm_set1_epi32(-1)); } template ::value>> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_si128(self, _mm_set1_epi32(-1)); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); } template 
XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_ps(self, other); } template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_si128(self, other); } template ::value>> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_si128(self, other); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_pd(self, other); } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); __m128i res = _mm_srai_epi16(self, other); return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_srai_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_srai_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { // from https://github.com/samyvilar/vect/blob/master/vect_128.h return _mm_or_si128( _mm_srli_epi64(self, other), _mm_slli_epi64( _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), 64 - other)); } else { assert(false && "unsupported arch/op 
combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_srli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_srli_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } } template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, requires_arch) noexcept { constexpr auto bits = std::numeric_limits::digits + std::numeric_limits::is_signed; static_assert(shift < bits, "Shift must be less than the number of value bits in the type"); XSIMD_IF_CONSTEXPR(shift == 0) { return self; } XSIMD_IF_CONSTEXPR(std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // 8-bit arithmetic right shift via 16-bit shift + sign-extension handling. __m128i shifted = _mm_srai_epi16(self, static_cast(shift)); __m128i sign_mask = _mm_set1_epi16(static_cast(0xFF00 >> shift)); __m128i cmp_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); return _mm_or_si128(_mm_and_si128(sign_mask, cmp_negative), _mm_andnot_si128(sign_mask, shifted)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_srai_epi16(self, static_cast(shift)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_srai_epi32(self, static_cast(shift)); } // No 64-bit arithmetic right shift in SSE2; fall back return bitwise_rshift(self, common {}); } else // unsigned / logical right shift { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // 8-bit left shift via 16-bit shift + mask __m128i shifted = _mm_srli_epi16(self, static_cast(shift)); // TODO(C++17): without `if constexpr` we must ensure the compile-time shift does not overflow constexpr uint8_t mask8 = static_cast(sizeof(T) == 1 ? 
((1u << shift) - 1u) : 0); const __m128i mask = _mm_set1_epi8(mask8); return _mm_and_si128(shifted, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_srli_epi16(self, static_cast(shift)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_srli_epi32(self, static_cast(shift)); } else // sizeof(T) == 8 { return _mm_srli_epi64(self, static_cast(shift)); } } } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template ::value>> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_si128(self, other); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } template ::value>> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_si128(self, other); } // bitwise_cast template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_castsi128_ps(self); } template >::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_castps_si128(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_castsi128_pd(self); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return 
_mm_castps_pd(self); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_castpd_ps(self); } template ::value>> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_castpd_si128(self); } // broadcast template batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept { return _mm_set1_ps(val); } template ::value>> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_set1_epi64x(val); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return _mm_set1_pd(val); } // store_complex namespace detail { // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return _mm_unpacklo_ps(self.real(), self.imag()); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return _mm_unpackhi_ps(self.real(), self.imag()); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return _mm_unpacklo_pd(self.real(), self.imag()); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return _mm_unpackhi_pd(self.real(), self.imag()); } } // decr_if template ::value>> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch(mask.data); } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm_div_ps(self, other); } template 
XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm_div_pd(self, other); } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_cvtepi32_ps(self); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to sse2 __m128i xH = _mm_srli_epi64(x, 32); xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm_add_pd(f, _mm_castsi128_pd(xL)); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to sse2 __m128i xH = _mm_srai_epi32(x, 16); xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm_add_pd(f, _mm_castsi128_pd(xL)); } template 
XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_cvttps_epi32(self); } } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpeq_ps(self, other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); } template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_cmpeq_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_cmpeq_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_cmpeq_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { __m128i tmp1 = _mm_cmpeq_epi32(self, other); __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); __m128i tmp3 = _mm_and_si128(tmp1, tmp2); __m128i tmp4 = _mm_srai_epi32(tmp3, 31); return _mm_shuffle_epi32(tmp4, 0xF5); } else { assert(false && "unsupported arch/op combination"); return {}; } } template ::value>> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpeq_pd(self, other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); } // first template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return _mm_cvtss_f32(self); } template XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(self); } template ::value>> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return 
static_cast(_mm_cvtsi128_si32(self) & 0xFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return static_cast(_mm_cvtsi128_si32(self) & 0xFFFF); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return static_cast(_mm_cvtsi128_si32(self)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { #if defined(__x86_64__) return static_cast(_mm_cvtsi128_si64(self)); #else __m128i m; _mm_storel_epi64(&m, self); int64_t i; std::memcpy(&i, &m, sizeof(i)); return i; #endif } else { assert(false && "unsupported arch/op combination"); return {}; } } // from_mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, }; assert(!(mask & ~0xFul) && "inbound mask"); return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask])); } template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 
0xFFFFFFFFFFFFFFFFul }, }; assert(!(mask & ~0x3ul) && "inbound mask"); return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask])); } template ::value>> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000, 0x000000000000FFFF, 0x00000000FFFF0000, 0x00000000FFFFFFFF, 0x0000FFFF00000000, 0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF, 0xFFFF000000000000, 0xFFFF00000000FFFF, 0xFFFF0000FFFF0000, 0xFFFF0000FFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF, }; alignas(A::alignment()) static const uint32_t lut32[] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFF) && "inbound mask"); return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFF) && "inbound mask"); return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_castps_si128(from_mask(batch_bool {}, mask, sse2 {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_castpd_si128(from_mask(batch_bool {}, mask, sse2 {})); } } // ge template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpge_ps(self, other); } template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpge_pd(self, other); } // gt template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_ps(self, other); } template ::value>> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { if 
(std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_cmpgt_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_cmpgt_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_cmpgt_epi32(self, other); } else { return gt(self, other, common {}); } } else { return gt(self, other, common {}); } } template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_pd(self, other); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); tmp0 = _mm_add_ps(tmp0, tmp1); tmp1 = _mm_unpacklo_ps(row[2], row[3]); tmp1 = _mm_add_ps(tmp1, tmp2); tmp2 = _mm_movehl_ps(tmp1, tmp0); tmp0 = _mm_movelh_ps(tmp0, tmp1); return _mm_add_ps(tmp0, tmp2); } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), _mm_unpackhi_pd(row[0], row[1])); } // incr_if template ::value>> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch(mask.data); } // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_insert_epi16(self, val, I); } else { return insert(self, val, pos, common {}); } } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm_cmpunord_ps(self, self); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm_cmpunord_pd(self, self); } // load_aligned template XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm_load_ps(mem); } template ::value>> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return 
_mm_load_si128((__m128i const*)mem); } template XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm_load_pd(mem); } // load_unaligned template XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return _mm_loadu_ps(mem); } template ::value>> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm_loadu_si128((__m128i const*)mem); } template XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm_loadu_pd(mem); } // load batch_bool template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { return _mm_sub_epi8(_mm_set1_epi8(0), _mm_loadu_si128((__m128i const*)mem)); } template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { return { load_unaligned(mem, batch_bool {}, r).data }; } template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { return { load_unaligned(mem, batch_bool {}, r).data }; } // load_masked template ::value>> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant mask, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.mask() == 0x1) { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return mm_loadu_si16(mem); } XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return mm_loadu_si32(mem); } XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return mm_loadu_si64(mem); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2 && mask.mask() == 0x3) { return mm_loadu_si32(mem); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4 && mask.mask() == 0x3) { return mm_loadu_si64(mem); } else { return load_masked(mem, mask, convert {}, Mode {}, common {}); } } template XSIMD_INLINE batch load_masked(float const* mem, batch_bool_constant mask, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.mask() == 0x1) { return _mm_load_ss(mem); } else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2) { return 
_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const*>(mem)); } else XSIMD_IF_CONSTEXPR(mask.countl_one() == 2) { return _mm_loadh_pi(_mm_setzero_ps(), reinterpret_cast<__m64 const*>(mem + 2)); } else { return load_masked(mem, mask, convert {}, Mode {}, common {}); } } template XSIMD_INLINE batch load_masked(double const* mem, batch_bool_constant mask, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.countr_one() == 1) { return _mm_load_sd(mem); } else XSIMD_IF_CONSTEXPR(mask.countl_one() == 1) { return _mm_loadh_pd(_mm_setzero_pd(), mem + 1); } else { return load_masked(mem, mask, convert {}, Mode {}, common {}); } } // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.mask() == 0x1) { _mm_store_ss(mem, src); } else XSIMD_IF_CONSTEXPR(mask.countr_one() == 2) { _mm_storel_pi(reinterpret_cast<__m64*>(mem), src); } else XSIMD_IF_CONSTEXPR(mask.countl_one() == 2) { _mm_storeh_pi(reinterpret_cast<__m64*>(mem + 2), src); } else { store_masked(mem, src, mask, Mode {}, common {}); } } template XSIMD_INLINE void store_masked(double* mem, batch const& src, batch_bool_constant mask, Mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.countr_one() == 1) { _mm_store_sd(mem, src); } else XSIMD_IF_CONSTEXPR(mask.countl_one() == 1) { _mm_storeh_pd(mem + 1, src); } else { store_masked(mem, src, mask, Mode {}, common {}); } } // load_complex namespace detail { // Redefine these methods in the SSE-based archs if required template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; } } // le template 
XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmple_ps(self, other); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmple_pd(self, other); } // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmplt_ps(self, other); } template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_cmplt_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_cmplt_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_cmplt_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { __m128i tmp1 = _mm_sub_epi64(self, other); __m128i tmp2 = _mm_xor_si128(self, other); __m128i tmp3 = _mm_andnot_si128(other, self); __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); __m128i tmp5 = _mm_or_si128(tmp3, tmp4); __m128i tmp6 = _mm_srai_epi32(tmp5, 31); return _mm_shuffle_epi32(tmp6, 0xF5); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits::lowest()))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); auto xother = _mm_xor_si128(other, 
_mm_set1_epi64x(std::numeric_limits::lowest())); __m128i tmp1 = _mm_sub_epi64(xself, xother); __m128i tmp2 = _mm_xor_si128(xself, xother); __m128i tmp3 = _mm_andnot_si128(xother, xself); __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); __m128i tmp5 = _mm_or_si128(tmp3, tmp4); __m128i tmp6 = _mm_srai_epi32(tmp5, 31); return _mm_shuffle_epi32(tmp6, 0xF5); } else { assert(false && "unsupported arch/op combination"); return {}; } } } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmplt_pd(self, other); } /* compression table to turn 0b10 into 0b1, * 0b100010 into 0b101 etc */ namespace detail { XSIMD_INLINE int mask_lut(uint64_t mask) { // clang-format off static const int mask_lut[256] = { 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 
0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, }; // clang-format on return mask_lut[mask & 0xAA]; } } // mask template ::value>> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_movemask_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint64_t mask8 = _mm_movemask_epi8(self); return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_movemask_ps(_mm_castsi128_ps(self)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_movemask_pd(_mm_castsi128_pd(self)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self); } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self); } // max template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm_max_ps(other, self); } template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm_max_pd(other, self); } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm_min_ps(other, self); } template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm_min_pd(other, self); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm_mul_ps(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { 
return _mm_mul_pd(self, other); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm_mullo_epi16(self, other); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm_cvtps_epi32(self); } // neg template ::value>> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm_xor_pd( self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpneq_ps(self, other); } template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template ::value>> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); } template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpneq_pd(self, other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { return _mm_rcp_ps(self); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); __m128 tmp1 = _mm_add_ss(tmp0, 
_mm_shuffle_ps(tmp0, tmp0, 1)); return _mm_cvtss_f32(tmp1); } template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); __m128i tmp2 = _mm_add_epi32(self, tmp1); __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); return _mm_cvtsi128_si32(tmp4); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); __m128i tmp2 = _mm_add_epi64(self, tmp1); #if defined(__x86_64__) return _mm_cvtsi128_si64(tmp2); #else __m128i m; _mm_storel_epi64(&m, tmp2); int64_t i; std::memcpy(&i, &m, sizeof(i)); return i; #endif } else { return reduce_add(self, common {}); } } template XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); } // reduce_max template > XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); batch step0 = _mm_shuffle_epi32(self, mask0); batch acc0 = max(self, step0); constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); batch step1 = _mm_shuffle_epi32(acc0, mask1); batch acc1 = max(acc0, step1); constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); batch step2 = _mm_shufflelo_epi16(acc1, mask2); batch acc2 = max(acc1, step2); if (sizeof(T) == 2) return first(acc2, A {}); batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); batch acc3 = max(acc2, step3); return first(acc3, A {}); } // reduce_min template > XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); batch step0 = _mm_shuffle_epi32(self, mask0); batch acc0 = min(self, step0); constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); batch step1 = _mm_shuffle_epi32(acc0, mask1); batch acc1 = min(acc0, step1); constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); batch step2 = _mm_shufflelo_epi16(acc1, mask2); batch 
acc2 = min(acc1, step2); if (sizeof(T) == 2) return first(acc2, A {}); batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); batch acc3 = min(acc2, step3); return first(acc3, A {}); } // reduce_mul template XSIMD_INLINE float reduce_mul(batch const& self, requires_arch) noexcept { __m128 tmp0 = _mm_mul_ps(self, _mm_movehl_ps(self, self)); __m128 tmp1 = _mm_mul_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); return _mm_cvtss_f32(tmp1); } template XSIMD_INLINE double reduce_mul(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(_mm_mul_sd(self, _mm_unpackhi_pd(self, self))); } template ::value>> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { batch tmp1 = _mm_shuffle_epi32(self, _MM_SHUFFLE(0, 1, 2, 3)); tmp1 = tmp1 * self; batch tmp2 = _mm_unpackhi_epi32(tmp1, tmp1); tmp2 = tmp2 * tmp1; return _mm_cvtsi128_si32(tmp2); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { batch tmp1 = _mm_unpackhi_epi64(self, self); auto tmp2 = tmp1 * self; #if defined(__x86_64__) return _mm_cvtsi128_si64(tmp2); #else __m128i m; _mm_storel_epi64(&m, tmp2); int64_t i; std::memcpy(&i, &m, sizeof(i)); return i; #endif } else { return reduce_mul(self, common {}); } } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm_rsqrt_ps(val); } template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); } template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& 
true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, sse2 {}); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) return _mm_shuffle_ps(x, y, smask); // shuffle within opposite lane if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) return _mm_shuffle_ps(y, x, smask); return shuffle(x, y, mask, common {}); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1); // shuffle within lane if (I0 < 2 && I1 >= 2) return _mm_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I0 >= 2 && I1 < 2) return _mm_shuffle_pd(y, x, smask); return shuffle(x, y, mask, common {}); } // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm_sqrt_ps(val); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm_sqrt_pd(val); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { return _mm_slli_si128(x, N); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { return _mm_srli_si128(x, N); } // sadd template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_adds_epi16(self, other); } else { return sadd(self, other, common {}); } } else { 
XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_adds_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_adds_epu16(self, other); } else { return sadd(self, other, common {}); } } } // set template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm_setr_ps(values...); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept { return _mm_set_epi64x(v1, v0); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return _mm_setr_epi32(v0, v1, v2, v3); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm_setr_pd(values...); } template ::value>> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... 
values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } // ssub template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_subs_epi16(self, other); } else { return ssub(self, other, common {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_subs_epu16(self, other); } else { return ssub(self, other, common {}); } } } // store namespace detail { template XSIMD_INLINE void store_bool_sse2(__m128i b, bool* mem, T) noexcept { // GCC <12 have missing or buggy unaligned store intrinsics; use memcpy to work around this. // GCC/Clang/MSVC will turn it into the correct store. XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // negate mask to convert to 0 or 1 auto val = _mm_sub_epi8(_mm_set1_epi8(0), b); memcpy(mem, &val, sizeof(val)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b)); #if defined(__x86_64__) auto val_lo = _mm_cvtsi128_si64(val); memcpy(mem, &val_lo, sizeof(val_lo)); #else memcpy(mem, &val, sizeof(uint64_t)); #endif } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto pack_16 = _mm_packs_epi32(b, b); uint32_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16))); memcpy(mem, &val, sizeof(val)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto pack_32 = _mm_packs_epi32(b, b); auto pack_16 = _mm_packs_epi32(pack_32, pack_32); uint16_t val = _mm_cvtsi128_si32(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(pack_16, pack_16))); memcpy(mem, &val, sizeof(val)); } else { assert(false && "unsupported arch/op combination"); } } XSIMD_INLINE __m128i sse_to_i(__m128 x) { return 
_mm_castps_si128(x); } XSIMD_INLINE __m128i sse_to_i(__m128d x) { return _mm_castpd_si128(x); } XSIMD_INLINE __m128i sse_to_i(__m128i x) { return x; } } template XSIMD_INLINE void store(batch_bool b, bool* mem, requires_arch) noexcept { detail::store_bool_sse2(detail::sse_to_i(b), mem, T {}); } // store_aligned template XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm_store_ps(mem, self); } template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm_store_si128((__m128i*)mem, self); } template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm_store_si128((__m128i*)mem, self); } template XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm_store_pd(mem, self); } // store_unaligned template XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm_storeu_ps(mem, self); } template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm_storeu_si128((__m128i*)mem, self); } template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm_storeu_si128((__m128i*)mem, self); } template XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm_storeu_pd(mem, self); } // store_stream template XSIMD_INLINE void store_stream(float* mem, batch const& self, requires_arch) noexcept { _mm_stream_ps(mem, self); } template ::value, void>> XSIMD_INLINE void store_stream(T* mem, batch const& self, requires_arch) noexcept { _mm_stream_si128((__m128i*)mem, self); } template XSIMD_INLINE void store_stream(double* mem, batch const& self, requires_arch) noexcept { _mm_stream_pd(mem, self); } // sub template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) 
noexcept { return _mm_sub_ps(self, other); } template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_sub_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_sub_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm_sub_pd(self, other); } // swizzle (constant mask) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_ps(self, self, index); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1); return _mm_shuffle_pd(self, self, index); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); return _mm_shuffle_epi32(self, index); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_epi32(self, index); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { constexpr bool is_identity = 
detail::is_identity(mask); constexpr bool is_dup_lo = detail::is_dup_lo(mask); constexpr bool is_dup_hi = detail::is_dup_hi(mask); XSIMD_IF_CONSTEXPR(is_identity) { return self; } XSIMD_IF_CONSTEXPR(is_dup_lo) { // permute the low half constexpr int imm = detail::mod_shuffle(V0, V1, V2, V3); const auto lo = _mm_shufflelo_epi16(self, imm); // broadcast that 64-bit low half into both halves const auto lo_all = _mm_unpacklo_epi64(lo, lo); return lo_all; } XSIMD_IF_CONSTEXPR(is_dup_hi) { // permute the high half constexpr int imm = detail::mod_shuffle(V4, V5, V6, V7); const auto hi = _mm_shufflehi_epi16(self, imm); // broadcast that 64-bit high half into both halves const auto hi_all = _mm_unpackhi_epi64(hi, hi); return hi_all; } // Only pick elements from the low lane XSIMD_IF_CONSTEXPR(detail::is_only_from_lo(mask)) { // permute within each sub lane constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7); __m128i lol = _mm_shufflelo_epi16(self, mask_lo); __m128i loh = _mm_shufflelo_epi16(self, mask_hi); // generate temporary lanes return _mm_unpacklo_epi64(lol, loh); } // Only pick elements from the high lane XSIMD_IF_CONSTEXPR(detail::is_only_from_hi(mask)) { // permute within each sub lane constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7); __m128i hil = _mm_shufflehi_epi16(self, mask_lo); __m128i hih = _mm_shufflehi_epi16(self, mask_hi); // generate temporary lanes return _mm_unpackhi_epi64(hil, hih); } // Generic case // permute within each sub lane constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7); __m128i lol = _mm_shufflelo_epi16(self, mask_lo); __m128i loh = _mm_shufflelo_epi16(self, mask_hi); __m128i hil = _mm_shufflehi_epi16(self, mask_lo); __m128i hih = _mm_shufflehi_epi16(self, mask_hi); // generate temporary lanes __m128i lo = 
_mm_unpacklo_epi64(lol, loh); __m128i hi = _mm_unpackhi_epi64(hil, hih); // mask to choose the right lane constexpr auto blend_mask = mask < std::integral_constant(); // blend the two permutes return select(blend_mask, batch(lo), batch(hi)); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; _MM_TRANSPOSE4_PS(r0, r1, r2, r3); matrix_begin[0] = r0; matrix_begin[1] = r1; matrix_begin[2] = r2; matrix_begin[3] = r3; } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = _mm_unpacklo_pd(r0, r1); matrix_begin[1] = _mm_unpackhi_pd(r0, r1); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // zip_hi template XSIMD_INLINE batch zip_hi(batch const& self, 
batch const& other, requires_arch) noexcept { return _mm_unpackhi_ps(self, other); } template ::value>> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_unpackhi_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpackhi_pd(self, other); } // zip_lo template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpacklo_ps(self, other); } template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_unpacklo_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_unpacklo_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_unpacklo_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_unpacklo_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpacklo_pd(self, other); } // store_masked template XSIMD_INLINE void store_masked(float* mem, batch const& src, batch_bool_constant mask, aligned_mode, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(mask.countr_one() == 2) { _mm_storel_pi(reinterpret_cast<__m64*>(mem), src); } else XSIMD_IF_CONSTEXPR(mask.countl_one() == 2) { _mm_storeh_pi(reinterpret_cast<__m64*>(mem + 2), src); } else { store_masked(mem, src, mask, requires_arch {}); } } // get (must appear after first and swizzle 
so it can delegate through the xsimd API) namespace detail { // broadcast lane index I across a batch_constant matching batch::size template XSIMD_INLINE auto broadcast_lane_index(std::index_sequence) noexcept -> batch_constant, A, static_cast>(Is * 0 + I)...> { return {}; } template XSIMD_INLINE auto broadcast_lane_index() noexcept -> decltype(broadcast_lane_index(std::make_index_sequence::size> {})) { return {}; } } template XSIMD_INLINE typename std::enable_if::value && sizeof(T) <= 2, T>::type get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, A {}); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return static_cast(_mm_extract_epi16(self, I)); } else { // SSE2 has no pextrb; byte-lane shift + movd is the shortest path for I>0. return static_cast(_mm_cvtsi128_si32(_mm_srli_si128(self, I)) & 0xFF); } } template XSIMD_INLINE typename std::enable_if<(std::is_integral::value && sizeof(T) >= 4) || std::is_floating_point::value, T>::type get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, A {}); } else { return first(swizzle(self, detail::broadcast_lane_index(), A {}), A {}); } } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_sse3.hpp000066400000000000000000000045731517435117100242160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE3_HPP #define XSIMD_SSE3_HPP #include "../types/xsimd_sse3_register.hpp" #include namespace xsimd { namespace kernel { using namespace types; // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]), _mm_hadd_ps(row[2], row[3])); } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return _mm_hadd_pd(row[0], row[1]); } // load_unaligned template ::value>> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm_lddqu_si128((__m128i const*)mem); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { __m128 tmp0 = _mm_hadd_ps(self, self); __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0); return _mm_cvtss_f32(tmp1); } // reduce_mul template XSIMD_INLINE float reduce_mul(batch const& self, requires_arch) noexcept { __m128 tmp1 = _mm_mul_ps(self, _mm_movehl_ps(self, self)); __m128 tmp2 = _mm_mul_ps(tmp1, _mm_movehdup_ps(tmp1)); return _mm_cvtss_f32(tmp2); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_sse4_1.hpp000066400000000000000000000501231517435117100244270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE4_1_HPP #define XSIMD_SSE4_1_HPP #include #include "../types/xsimd_sse4_1_register.hpp" #include "./common/xsimd_common_cast.hpp" namespace xsimd { namespace kernel { using namespace types; // any template ::value>> XSIMD_INLINE bool any(batch const& self, requires_arch) noexcept { return !_mm_testz_si128(self, self); } // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm_ceil_ps(self); } template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm_ceil_pd(self); } // bitwise_lshift multiple (constant) template XSIMD_INLINE batch bitwise_lshift( batch const& self, batch_constant, requires_arch) noexcept { constexpr auto mults = batch_constant(1u << Vs)...>(); return _mm_mullo_epi32(self, mults.as_batch()); } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx __m128i xH = _mm_srai_epi32(x, 16); xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33); xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm_add_pd(f, _mm_castsi128_pd(xL)); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx __m128i xH = _mm_srli_epi64(x, 32); xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 __m128i xL = _mm_blend_epi16(x, 
_mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 detail::reassociation_barrier(f, "prevent (xH-C)+xL -> xH+(xL-C)"); return _mm_add_pd(f, _mm_castsi128_pd(xL)); } } // eq template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_cmpeq_epi64(self, other); } else { return eq(self, other, ssse3 {}); } } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm_floor_ps(self); } template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm_floor_pd(self); } // get template ::value>> XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(I == 0) { return first(self, sse2 {}); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return static_cast(_mm_extract_epi8(self, I)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return static_cast(_mm_extract_epi16(self, I)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return static_cast(_mm_extract_epi32(self, I)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { #if defined(__x86_64__) return static_cast(_mm_extract_epi64(self, I)); #else return get(self, ::xsimd::index {}, sse2 {}); #endif } else { assert(false && "unsupported arch/op combination"); return {}; } } // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_insert_epi8(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_insert_epi32(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { #if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64)) return _mm_insert_epi64(self, val, I); #else uint32_t lo, hi; memcpy(&lo, (reinterpret_cast(&val)), sizeof(lo)); memcpy(&hi, (reinterpret_cast(&val)) + 1, 
sizeof(hi)); return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1); #endif } else { return insert(self, val, pos, ssse3 {}); } } // load_unaligned template ::value && sizeof(T) > 1)>> XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { // GCC <12 have missing or buggy unaligned load intrinsics; use memcpy to work around this. // GCC/Clang/MSVC will turn it into the correct load. XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { #if defined(__x86_64__) uint64_t tmp; memcpy(&tmp, mem, sizeof(tmp)); auto val = _mm_cvtsi64_si128(tmp); #else __m128i val; memcpy(&val, mem, sizeof(uint64_t)); #endif return { _mm_sub_epi16(_mm_set1_epi8(0), _mm_cvtepu8_epi16(val)) }; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { uint32_t tmp; memcpy(&tmp, mem, sizeof(tmp)); return { _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp))) }; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { uint16_t tmp; memcpy(&tmp, mem, sizeof(tmp)); return { _mm_sub_epi64(_mm_set1_epi8(0), _mm_cvtepu8_epi64(_mm_cvtsi32_si128((uint32_t)tmp))) }; } else { assert(false && "unsupported arch/op combination"); return __m128i {}; } } template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { return { _mm_castsi128_ps(load_unaligned(mem, batch_bool {}, r)) }; } template XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch r) noexcept { return { _mm_castsi128_pd(load_unaligned(mem, batch_bool {}, r)) }; } // max template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_max_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_max_epi32(self, other); } else { return max(self, other, ssse3 {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return 
_mm_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_max_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_max_epu32(self, other); } else { return max(self, other, ssse3 {}); } } } // load_stream template ::value, void>> XSIMD_INLINE batch load_stream(T const* mem, convert, requires_arch) noexcept { return _mm_stream_load_si128((__m128i*)mem); } template XSIMD_INLINE batch load_stream(float const* mem, convert, requires_arch) noexcept { return _mm_castsi128_ps(_mm_stream_load_si128((__m128i*)mem)); } template XSIMD_INLINE batch load_stream(double const* mem, convert, requires_arch) noexcept { return _mm_castsi128_pd(_mm_stream_load_si128((__m128i*)mem)); } // min template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_min_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_min_epi32(self, other); } else { return min(self, other, ssse3 {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_min_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_min_epu32(self, other); } else { return min(self, other, ssse3 {}); } } } // mul template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_or_si128( _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)), _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_mullo_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_mullo_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return 
_mm_add_epi64( _mm_mul_epu32(self, other), _mm_slli_epi64( _mm_add_epi64( _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))), _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))), 32)); } else { assert(false && "unsupported arch/op combination"); return {}; } } // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT); } // select namespace detail { template XSIMD_INLINE constexpr T interleave(T const& cond) noexcept { return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA); } } template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_blendv_epi8(false_br, true_br, cond); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_blendv_ps(false_br, true_br, cond); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_blendv_pd(false_br, true_br, cond); } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr int mask = batch_bool_constant::mask(); XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_blend_epi16(false_br, true_br, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { constexpr int imask = detail::interleave(mask); return _mm_blend_epi16(false_br, true_br, imask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { constexpr int imask = detail::interleave(mask); constexpr int imask2 = 
detail::interleave(imask); return _mm_blend_epi16(false_br, true_br, imask2); } else { return select(batch_bool_constant(), true_br, false_br, ssse3 {}); } } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr int mask = batch_bool_constant::mask(); return _mm_blend_ps(false_br, true_br, mask); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr int mask = batch_bool_constant::mask(); return _mm_blend_pd(false_br, true_br, mask); } // trunc template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm_round_ps(self, _MM_FROUND_TO_ZERO); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm_round_pd(self, _MM_FROUND_TO_ZERO); } // widen template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { __m128i x_lo = x; __m128i x_hi = _mm_unpackhi_epi64(x, x); __m128i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm_cvtepi32_epi64(x_lo); hi = _mm_cvtepi32_epi64(x_hi); } else { lo = _mm_cvtepu32_epi64(x_lo); hi = _mm_cvtepu32_epi64(x_hi); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm_cvtepi16_epi32(x_lo); hi = _mm_cvtepi16_epi32(x_hi); } else { lo = _mm_cvtepu16_epi32(x_lo); hi = _mm_cvtepu16_epi32(x_hi); } } else XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { XSIMD_IF_CONSTEXPR(std::is_signed::value) { lo = _mm_cvtepi8_epi16(x_lo); hi = _mm_cvtepi8_epi16(x_hi); } else { lo = _mm_cvtepu8_epi16(x_lo); hi = _mm_cvtepu8_epi16(x_hi); } } return { lo, hi }; } template XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept { __m128 x_shuf = _mm_unpackhi_ps(x, x); __m128d lo = _mm_cvtps_pd(x); __m128d hi = _mm_cvtps_pd(x_shuf); return { lo, hi }; } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_sse4_2.hpp000066400000000000000000000032411517435117100244270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SSE4_2_HPP #define XSIMD_SSE4_2_HPP #include #include "../types/xsimd_sse4_2_register.hpp" namespace xsimd { namespace kernel { using namespace types; // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_epi64(other, self); } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); return _mm_cmpgt_epi64(xother, xself); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_ssse3.hpp000066400000000000000000000201761517435117100243760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSSE3_HPP #define XSIMD_SSSE3_HPP #include #include #include "../types/xsimd_ssse3_register.hpp" #include "../types/xsimd_utils.hpp" namespace xsimd { namespace kernel { using namespace types; // abs template ::value && std::is_signed::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_abs_epi16(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm_abs_epi32(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm_abs_epi64(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } // extract_pair namespace detail { template XSIMD_INLINE batch extract_pair(batch const&, batch const& other, std::size_t, std::index_sequence<>) noexcept { return other; } template XSIMD_INLINE batch extract_pair(batch const& self, batch const& other, std::size_t i, std::index_sequence) noexcept { if (i == I) { return _mm_alignr_epi8(self, other, sizeof(T) * I); } else return extract_pair(self, other, i, std::index_sequence()); } } template ::value>> XSIMD_INLINE batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(i < size && "index in bounds"); return detail::extract_pair(self, other, i, std::make_index_sequence()); } // reduce_add template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { __m128i tmp1 = _mm_hadd_epi16(self, self); __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1); __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2); return _mm_cvtsi128_si32(tmp3) & 0xFFFF; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { __m128i tmp1 = _mm_hadd_epi32(self, self); __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1); return _mm_cvtsi128_si32(tmp2); } else { return reduce_add(self, sse3 
{}); } } // rotate_left template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return _mm_alignr_epi8(self, self, N); } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), ssse3 {})); } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return _mm_alignr_epi8(self, self, 2 * N); } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), ssse3 {})); } // swizzle (dynamic mask) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm_shuffle_epi8(self, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm_shuffle_epi8(self, mask); } template XSIMD_INLINE std::enable_if_t::value, batch> swizzle(batch const& self, batch mask, requires_arch) noexcept { constexpr auto pikes = static_cast>(0x0706050403020100ul); constexpr auto comb = static_cast>(0x0101010101010101ul * sizeof(T)); return bitwise_cast(swizzle(bitwise_cast(self), bitwise_cast(mask * comb + pikes), ssse3 {})); } // swizzle (constant mask) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr batch_constant mask8; return _mm_shuffle_epi8(self, mask8.as_batch()); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, ssse3 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), ssse3 {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), ssse3 {}); } } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_sve.hpp000066400000000000000000001477671517435117100241530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Yibo Cai * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_SVE_HPP #define XSIMD_SVE_HPP #include #include #include "../config/xsimd_config.hpp" #include "../config/xsimd_macros.hpp" #include "../types/xsimd_sve_register.hpp" // Define a inline namespace with the explicit SVE vector size to avoid ODR violation // When dynamically dispatching between different SVE sizes. // While most code is safe from ODR violation as the size is already encoded in the // register (and hence batch) types, utilities can quickly fall prone to this issue. 
#define XSIMD_SVE_NAMESPACE XSIMD_CONCAT(sve, XSIMD_SVE_BITS) namespace xsimd { template struct batch_constant; namespace kernel { namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { using xsimd::index; using xsimd::types::detail::sve_vector_type; // predicate creation XSIMD_INLINE svbool_t ptrue_impl(index<1>) noexcept { return svptrue_b8(); } XSIMD_INLINE svbool_t ptrue_impl(index<2>) noexcept { return svptrue_b16(); } XSIMD_INLINE svbool_t ptrue_impl(index<4>) noexcept { return svptrue_b32(); } XSIMD_INLINE svbool_t ptrue_impl(index<8>) noexcept { return svptrue_b64(); } template XSIMD_INLINE svbool_t ptrue() noexcept { return ptrue_impl(index {}); } // predicate loading template XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b64(M0, M1); } template XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b32(M0, M1, M2, M3); } template XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b16(M0, M1, M2, M3, M4, M5, M6, M7); } template XSIMD_INLINE svbool_t pmask() noexcept { return svdupq_b8(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15); } // count active lanes in a predicate XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); } XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); } XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); } XSIMD_INLINE uint64_t pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); } template XSIMD_INLINE uint64_t pcount(svbool_t p) noexcept { return pcount_impl(p, index {}); } // enable for signed integers or floating points template using enable_signed_int_or_floating_point_t = std::enable_if_t::value, int>; // `sizeless` is the matching sizeless SVE type. 
xsimd stores SVE // vectors as fixed-size attributed types (arm_sve_vector_bits), // which clang treats as implicitly convertible to every sizeless // SVE type — including multi-vector tuples — making the overloaded // svreinterpret_*/svsel/etc. intrinsics ambiguous. Static-casting // to `sizeless` first collapses the overload set to the single // 1-vector candidate. template using sizeless_t = xsimd::types::detail::sizeless_sve_vector_type; } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve /********* * Load * *********/ template = 0> XSIMD_INLINE batch load_aligned(T const* src, convert, requires_arch) noexcept { return svld1(detail_sve::ptrue(), reinterpret_cast const*>(src)); } template = 0> XSIMD_INLINE batch load_unaligned(T const* src, convert, requires_arch) noexcept { return load_aligned(src, convert(), sve {}); } // load_masked template = 0> XSIMD_INLINE batch load_masked(T const* mem, batch_bool_constant, Mode, requires_arch) noexcept { return svld1(detail_sve::pmask(), reinterpret_cast const*>(mem)); } // load_complex template = 0> XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { const T* buf = reinterpret_cast(mem); const auto tmp = svld2(detail_sve::ptrue(), buf); const auto real = svget2(tmp, 0); const auto imag = svget2(tmp, 1); return batch, A> { real, imag }; } template = 0> XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept { return load_complex_aligned(mem, convert> {}, sve {}); } /********* * Store * *********/ template = 0> XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { svst1(detail_sve::ptrue(), reinterpret_cast*>(dst), src); } template = 0> XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept { store_aligned(dst, src, sve {}); } // store_complex template = 0> XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) 
noexcept { using v2type = std::conditional_t<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>; v2type tmp {}; tmp = svset2(tmp, 0, src.real()); tmp = svset2(tmp, 1, src.imag()); T* buf = reinterpret_cast(dst); svst2(detail_sve::ptrue(), buf, tmp); } template = 0> XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { store_complex_aligned(dst, src, sve {}); } /****************** * scatter/gather * ******************/ namespace detail_sve { template using enable_sg_t = std::enable_if_t<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>; } // scatter template = 0> XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { svst1_scatter_index(detail_sve::ptrue(), dst, index.data, src.data); } // gather template = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return svld1_gather_index(detail_sve::ptrue(), src, index.data); } /******************** * Scalar to vector * ********************/ // broadcast template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_u8(uint8_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_s8(int8_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_u16(uint16_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_s16(int16_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_u32(uint32_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_s32(int32_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_u64(uint64_t(arg)); } template = 0> XSIMD_INLINE batch broadcast(T arg, requires_arch) noexcept { return svdup_n_s64(int64_t(arg)); } template 
XSIMD_INLINE batch broadcast(float arg, requires_arch) noexcept { return svdup_n_f32(arg); } template XSIMD_INLINE batch broadcast(double arg, requires_arch) noexcept { return svdup_n_f64(arg); } template = 0> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return broadcast(val, sve {}); } /************** * Arithmetic * **************/ // add template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svadd_x(detail_sve::ptrue(), lhs, rhs); } // sadd template = 0> XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svqadd(lhs, rhs); } // sub template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svsub_x(detail_sve::ptrue(), lhs, rhs); } // ssub template = 0> XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svqsub(lhs, rhs); } // mul template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svmul_x(detail_sve::ptrue(), lhs, rhs); } // div template = 4, int> = 0> XSIMD_INLINE batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svdiv_x(detail_sve::ptrue(), lhs, rhs); } // max template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svmax_x(detail_sve::ptrue(), lhs, rhs); } // min template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svmin_x(detail_sve::ptrue(), lhs, rhs); } // neg template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { return svreinterpret_u8(svneg_x(detail_sve::ptrue(), svreinterpret_s8(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { return svreinterpret_u16(svneg_x(detail_sve::ptrue(), svreinterpret_s16(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept 
{ return svreinterpret_u32(svneg_x(detail_sve::ptrue(), svreinterpret_s32(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { return svreinterpret_u64(svneg_x(detail_sve::ptrue(), svreinterpret_s64(static_cast>(arg)))); } template = 0> XSIMD_INLINE batch neg(batch const& arg, requires_arch) noexcept { return svneg_x(detail_sve::ptrue(), arg); } // abs template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { return arg; } template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { return svabs_x(detail_sve::ptrue(), arg); } // fma: x * y + z template = 0> XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return svmad_x(detail_sve::ptrue(), x, y, z); } // fnma: z - x * y template = 0> XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return svmsb_x(detail_sve::ptrue(), x, y, z); } // fms: x * y - z template = 0> XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -fnma(x, y, z, sve {}); } // fnms: - x * y - z template = 0> XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -fma(x, y, z, sve {}); } /********************** * Logical operations * **********************/ // bitwise_and template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svand_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = svand_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { const 
auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = svand_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return svand_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_andnot template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svbic_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = svbic_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = svbic_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return svbic_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_or template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svorr_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = svorr_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_or(batch const& lhs, 
batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = svorr_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return svorr_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_xor template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { return sveor_x(detail_sve::ptrue(), lhs, rhs); } template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u32(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u32(static_cast>(rhs)); const auto result_bits = sveor_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { const auto lhs_bits = svreinterpret_u64(static_cast>(lhs)); const auto rhs_bits = svreinterpret_u64(static_cast>(rhs)); const auto result_bits = sveor_x(detail_sve::ptrue(), lhs_bits, rhs_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return sveor_z(detail_sve::ptrue(), lhs, rhs); } // bitwise_not template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { return svnot_x(detail_sve::ptrue(), arg); } template XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { const auto arg_bits = svreinterpret_u32(static_cast>(arg)); const auto result_bits = svnot_x(detail_sve::ptrue(), arg_bits); return svreinterpret_f32(result_bits); } template XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { const auto arg_bits = 
svreinterpret_u64(static_cast>(arg)); const auto result_bits = svnot_x(detail_sve::ptrue(), arg_bits); return svreinterpret_f64(result_bits); } template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { return svnot_z(detail_sve::ptrue(), arg); } /********** * Shifts * **********/ namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { template XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<1>) noexcept { return svreinterpret_u8(static_cast>(arg)); } template XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<2>) noexcept { return svreinterpret_u16(static_cast>(arg)); } template XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<4>) noexcept { return svreinterpret_u32(static_cast>(arg)); } template XSIMD_INLINE batch to_unsigned_batch_impl(batch const& arg, index<8>) noexcept { return svreinterpret_u64(static_cast>(arg)); } template > XSIMD_INLINE batch to_unsigned_batch(batch const& arg) noexcept { return to_unsigned_batch_impl(arg, index {}); } } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve // bitwise_lshift template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return svlsl_x(detail_sve::ptrue(), arg, n); } template = 0> XSIMD_INLINE batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svlsl_x(detail_sve::ptrue(), lhs, detail_sve::to_unsigned_batch(rhs)); } // bitwise_rshift template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return svlsr_x(detail_sve::ptrue(), arg, static_cast(n)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, 
requires_arch) noexcept { return svlsr_x(detail_sve::ptrue(), lhs, rhs); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept { constexpr std::size_t size = sizeof(typename batch::value_type) * 8; assert(0 <= n && static_cast(n) < size && "index in bounds"); return svasr_x(detail_sve::ptrue(), arg, static_cast>(n)); } template = 0> XSIMD_INLINE batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svasr_x(detail_sve::ptrue(), lhs, detail_sve::to_unsigned_batch(rhs)); } /************** * Reductions * **************/ // reduce_add template ::value_type, detail::enable_arithmetic_t = 0> XSIMD_INLINE V reduce_add(batch const& arg, requires_arch) noexcept { // sve integer reduction results are promoted to 64 bits return static_cast(svaddv(detail_sve::ptrue(), arg)); } // reduce_max template = 0> XSIMD_INLINE T reduce_max(batch const& arg, requires_arch) noexcept { return svmaxv(detail_sve::ptrue(), arg); } // reduce_min template = 0> XSIMD_INLINE T reduce_min(batch const& arg, requires_arch) noexcept { return svminv(detail_sve::ptrue(), arg); } // haddp template = 0> XSIMD_INLINE batch haddp(const batch* row, requires_arch) noexcept { constexpr std::size_t size = batch::size; T sums[size]; for (std::size_t i = 0; i < size; ++i) { sums[i] = reduce_add(row[i], sve {}); } return svld1(detail_sve::ptrue(), sums); } /*************** * Comparisons * ***************/ // eq template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svcmpeq(detail_sve::ptrue(), lhs, rhs); } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { const auto neq_result = sveor_z(detail_sve::ptrue(), lhs, rhs); return svnot_z(detail_sve::ptrue(), neq_result); } // neq template = 0> XSIMD_INLINE batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svcmpne(detail_sve::ptrue(), lhs, 
rhs); } template = 0> XSIMD_INLINE batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { return sveor_z(detail_sve::ptrue(), lhs, rhs); } // lt template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svcmplt(detail_sve::ptrue(), lhs, rhs); } // le template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svcmple(detail_sve::ptrue(), lhs, rhs); } // gt template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svcmpgt(detail_sve::ptrue(), lhs, rhs); } // ge template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svcmpge(detail_sve::ptrue(), lhs, rhs); } /*************** * Permutation * ***************/ // rotate_left template = 0> XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { return svext(a, a, N); } // swizzle (dynamic) template XSIMD_INLINE batch swizzle(batch const& arg, batch indices, requires_arch) noexcept { return svtbl(arg, indices); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch indices, requires_arch) noexcept { const auto real = swizzle(self.real(), indices, sve {}); const auto imag = swizzle(self.imag(), indices, sve {}); return batch>(real, imag); } // swizzle (static) template XSIMD_INLINE batch swizzle(batch const& arg, batch_constant indices, requires_arch) noexcept { static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); return swizzle(arg, indices.as_batch(), sve {}); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& arg, batch_constant indices, requires_arch) noexcept { static_assert(batch, A>::size == sizeof...(idx), "invalid swizzle indices"); return swizzle(arg, indices.as_batch(), sve {}); } /************* * Selection * *************/ // extract_pair namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { template 
XSIMD_INLINE batch extract_pair(batch const&, batch const& /*rhs*/, std::size_t, std::index_sequence<>) noexcept { assert(false && "extract_pair out of bounds"); return batch {}; } template XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence) noexcept { if (n == I) { return svext(rhs, lhs, I); } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } template XSIMD_INLINE batch extract_pair_impl(batch const& lhs, batch const& rhs, std::size_t n, std::index_sequence<0, Is...>) noexcept { if (n == 0) { return rhs; } else { return extract_pair(lhs, rhs, n, std::index_sequence()); } } } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve template = 0> XSIMD_INLINE batch extract_pair(batch const& lhs, batch const& rhs, std::size_t n, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(n < size && "index in bounds"); return detail_sve::extract_pair_impl(lhs, rhs, n, std::make_index_sequence()); } // select template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { return svsel(cond, static_cast>(a), static_cast>(b)); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { b... 
}, true_br, false_br, sve {}); } // zip_lo template = 0> XSIMD_INLINE batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svzip1(lhs, rhs); } // zip_hi template = 0> XSIMD_INLINE batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { return svzip2(lhs, rhs); } /***************************** * Floating-point arithmetic * *****************************/ // rsqrt template = 0> XSIMD_INLINE batch rsqrt(batch const& arg, requires_arch) noexcept { return svrsqrte(arg); } // sqrt template = 0> XSIMD_INLINE batch sqrt(batch const& arg, requires_arch) noexcept { return svsqrt_x(detail_sve::ptrue(), arg); } // reciprocal template = 0> XSIMD_INLINE batch reciprocal(const batch& arg, requires_arch) noexcept { return svrecpe(arg); } /****************************** * Floating-point conversions * ******************************/ // fast_cast namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { template = 0> XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return svcvt_f32_x(detail_sve::ptrue(), arg); } template = 0> XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return svcvt_f64_x(detail_sve::ptrue(), arg); } template XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return svcvt_s32_x(detail_sve::ptrue(), arg); } template XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return svcvt_u32_x(detail_sve::ptrue(), arg); } template XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return svcvt_s64_x(detail_sve::ptrue(), arg); } template XSIMD_INLINE batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept { return svcvt_u64_x(detail_sve::ptrue(), arg); } } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve /********* * Miscs * *********/ // set template XSIMD_INLINE batch set(batch const&, requires_arch, Args... 
args) noexcept { return detail_sve::sve_vector_type { args... }; } template XSIMD_INLINE batch, A> set(batch, A> const&, requires_arch, Args... args_complex) noexcept { return batch>(detail_sve::sve_vector_type { args_complex.real()... }, detail_sve::sve_vector_type { args_complex.imag()... }); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { using U = as_unsigned_integer_t; const auto values = detail_sve::sve_vector_type { static_cast(args)... }; const auto zero = broadcast(static_cast(0), sve {}); return svcmpne(detail_sve::ptrue(), values, zero); } // insert namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { // generate index sequence (iota) XSIMD_INLINE svuint8_t iota_impl(index<1>) noexcept { return svindex_u8(0, 1); } XSIMD_INLINE svuint16_t iota_impl(index<2>) noexcept { return svindex_u16(0, 1); } XSIMD_INLINE svuint32_t iota_impl(index<4>) noexcept { return svindex_u32(0, 1); } XSIMD_INLINE svuint64_t iota_impl(index<8>) noexcept { return svindex_u64(0, 1); } template >> XSIMD_INLINE V iota() noexcept { return iota_impl(index {}); } } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve template = 0> XSIMD_INLINE batch insert(batch const& arg, T val, index, requires_arch) noexcept { // create a predicate with only the I-th lane activated const auto iota = detail_sve::iota(); const auto index_predicate = svcmpeq(detail_sve::ptrue(), iota, static_cast>(I)); return svsel(index_predicate, static_cast>(broadcast(val, sve {})), static_cast>(arg)); } // first template = 0> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { return self.data[0]; } // all template = 0> XSIMD_INLINE bool all(batch_bool const& arg, requires_arch) noexcept { return detail_sve::pcount(arg) == batch_bool::size; } // any template = 0> XSIMD_INLINE bool any(batch_bool const& arg, requires_arch) noexcept { return svptest_any(arg, arg); } // bitwise_cast template = 0, detail::enable_sized_unsigned_t = 0> 
XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_u8(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_s8(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_u16(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_s16(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_u32(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_s32(static_cast>(arg)); } template = 0, detail::enable_sized_unsigned_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_u64(static_cast>(arg)); } template = 0, detail::enable_sized_signed_t = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_s64(static_cast>(arg)); } template = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_f32(static_cast>(arg)); } template = 0> XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { return svreinterpret_f64(static_cast>(arg)); } // batch_bool_cast template = 0> XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept { return arg.data; } // from_bool template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) 
noexcept { return select(arg, batch(1), batch(0)); } // slide_left namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { template struct slider_left { template XSIMD_INLINE batch operator()(batch const& arg) noexcept { using u8_vector = batch; const auto left = svdup_n_u8(0); const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data; const u8_vector result(svext(left, right, u8_vector::size - N)); return bitwise_cast(result, batch {}, sve {}); } }; template <> struct slider_left<0> { template XSIMD_INLINE batch operator()(batch const& arg) noexcept { return arg; } }; } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve template = 0> XSIMD_INLINE batch slide_left(batch const& arg, requires_arch) noexcept { return detail_sve::slider_left()(arg); } // slide_right namespace detail_sve { inline namespace XSIMD_SVE_NAMESPACE { template struct slider_right { template XSIMD_INLINE batch operator()(batch const& arg) noexcept { using u8_vector = batch; const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data; const auto right = svdup_n_u8(0); const u8_vector result(svext(left, right, N)); return bitwise_cast(result, batch {}, sve {}); } }; template <> struct slider_right::size> { template XSIMD_INLINE batch operator()(batch const&) noexcept { return batch {}; } }; } // namespace XSIMD_SVE_NAMESPACE } // namespace detail_sve template = 0> XSIMD_INLINE batch slide_right(batch const& arg, requires_arch) noexcept { return detail_sve::slider_right()(arg); } // isnan template = 0> XSIMD_INLINE batch_bool isnan(batch const& arg, requires_arch) noexcept { return !(arg == arg); } // nearbyint template = 0> XSIMD_INLINE batch nearbyint(batch const& arg, requires_arch) noexcept { return svrintx_x(detail_sve::ptrue(), arg); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { const auto nearest = svrintx_x(detail_sve::ptrue(), arg); return svcvt_s32_x(detail_sve::ptrue(), nearest); } template 
XSIMD_INLINE batch nearbyint_as_int(batch const& arg, requires_arch) noexcept { const auto nearest = svrintx_x(detail_sve::ptrue(), arg); return svcvt_s64_x(detail_sve::ptrue(), nearest); } // ldexp template = 0> XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& exp, requires_arch) noexcept { return svscale_x(detail_sve::ptrue(), x, exp); } } // namespace kernel } // namespace xsimd #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_vsx.hpp000066400000000000000000001203651517435117100241570ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_VSX_HPP #define XSIMD_VSX_HPP #include #include #include "../types/xsimd_vsx_register.hpp" #include "./common/xsimd_common_cast.hpp" #include namespace xsimd { template struct batch_bool_constant; template XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; template struct batch_constant; namespace kernel { template XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { return vec_abs(self.data); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { return vec_abs(self.data); } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return vec_add(self.data, other.data); } // all template ::value>> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return vec_all_ne(self.data, 
vec_xor(self.data, self.data)); } // any template ::value>> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return vec_any_ne(self.data, vec_xor(self.data, self.data)); } // avgr template ::value && sizeof(T) < 8>> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { return vec_avg(self.data, other.data); } template XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { return avgr(self, other, common {}); } template XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { return avgr(self, other, common {}); } // avg template ::value>> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) < 8) { constexpr auto nbit = 8 * sizeof(T) - 1; auto adj = bitwise_cast(bitwise_cast>((self ^ other) << nbit) >> nbit); return avgr(self, other, A {}) - adj; } else { return avg(self, other, common {}); } } template XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { return avg(self, other, common {}); } template XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { return avg(self, other, common {}); } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return (typename batch_bool::register_type)self.data; } // bitwise_and template ::value>> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return vec_and(self.data, other.data); } template ::value>> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return vec_and(self.data, other.data); } // bitwise_andnot template ::value>> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return vec_and(self.data, vec_nor(other.data, other.data)); } 
template ::value>> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return self.data & ~other.data; } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { using shift_type = as_unsigned_integer_t; batch shift(static_cast(other)); return vec_sl(self.data, shift.data); } // bitwise_not template ::value>> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return vec_nor(self.data, self.data); } template ::value>> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return vec_nor(self.data, self.data); } // bitwise_or template ::value>> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return vec_or(self.data, other.data); } template ::value>> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return vec_or(self.data, other.data); } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { using shift_type = as_unsigned_integer_t; batch shift(static_cast(other)); XSIMD_IF_CONSTEXPR(std::is_signed::value) { return vec_sra(self.data, shift.data); } else { return vec_sr(self.data, shift.data); } } // bitwise_xor template ::value>> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return vec_xor(self.data, other.data); } template ::value>> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return vec_xor(self.data, other.data); } // bitwise_cast template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return (typename batch::register_type)(self.data); } // broadcast template ::value>> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return 
vec_splats(val); } // ceil template ::value>> XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return vec_ceil(self.data); } // store_complex namespace detail { // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return vec_mergeh(self.real().data, self.imag().data); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return vec_mergeh(self.real().data, self.imag().data); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return vec_mergel(self.real().data, self.imag().data); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return vec_mergel(self.real().data, self.imag().data); } } // decr_if template ::value>> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch((typename batch::register_type)mask.data); } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return vec_div(self.data, other.data); } template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return vec_div(self.data, other.data); } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vec_ctf(self.data, 0); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vec_ctf(self.data, 0); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vec_cts(self.data, 0); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return vec_ctu(self.data, 0); } } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vec_madd(x.data, y.data, z.data); } template 
XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vec_madd(x.data, y.data, z.data); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vec_msub(x.data, y.data, z.data); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vec_msub(x.data, y.data, z.data); } // eq template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { auto res = vec_cmpeq(self.data, other.data); return *reinterpret_cast::register_type*>(&res); } template ::value>> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { auto res = vec_cmpeq(self.data, other.data); return *reinterpret_cast::register_type*>(&res); } // first template ::value>> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { return vec_extract(self.data, 0); } // floor template ::value>> XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return vec_floor(self.data); } // ge template ::value>> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmpge(self.data, other.data); } // gt template ::value>> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmpgt(self.data, other.data); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { auto tmp0 = vec_mergee(row[0].data, row[1].data); // v00 v10 v02 v12 auto tmp1 = vec_mergeo(row[0].data, row[1].data); // v01 v11 v03 v13 auto tmp4 = vec_add(tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13) auto tmp2 = vec_mergee(row[2].data, row[3].data); // v20 v30 v22 v32 auto tmp3 = vec_mergeo(row[2].data, row[3].data); // v21 v31 v23 v33 auto tmp5 = vec_add(tmp2, tmp3); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) auto tmp6 = 
vec_perm(tmp4, tmp5, (__vector unsigned char) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31) auto tmp7 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }); // (v02 + v03, v12 + v13, v22 + v23, v32 + v33) return vec_add(tmp6, tmp7); } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { auto tmp0 = vec_mergee(row[0].data, row[1].data); // v00 v10 auto tmp1 = vec_mergeo(row[0].data, row[1].data); // v01 v11 return vec_add(tmp0, tmp1); } // incr_if template ::value>> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch((typename batch::register_type)mask.data); } // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { return vec_insert(val, self.data, I); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return ~vec_cmpeq(self.data, self.data); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return ~vec_cmpeq(self.data, self.data); } // load_aligned template ::value>> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return vec_ld(0, reinterpret_cast::register_type*>(mem)); } // load_unaligned template ::value>> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return vec_vsx_ld(0, (typename batch::register_type const*)mem); } // load_complex namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __vector unsigned char perme = { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; __vector unsigned char permo = { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; return { vec_perm(hi.data, lo.data, perme), vec_perm(hi.data, lo.data, permo) }; } template 
XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { vec_mergee(hi.data, lo.data), vec_mergeo(hi.data, lo.data) }; } } // le template ::value>> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmple(self.data, other.data); } // lt template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmplt(self.data, other.data); } // max template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return vec_max(self.data, other.data); } // min template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return vec_min(self.data, other.data); } // mul template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return self.data * other.data; } // neg template ::value>> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return -(self.data); } // neq template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~vec_cmpeq(self.data, other.data); } template ::value>> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~vec_cmpeq(self.data, other.data); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { return vec_re(self.data); } template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { return vec_re(self.data); } // reduce_add template XSIMD_INLINE signed reduce_add(batch const& self, requires_arch) noexcept { auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } 
template XSIMD_INLINE unsigned reduce_add(batch const& self, requires_arch) noexcept { auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { // FIXME: find an in-order approach auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } template XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept { auto tmp0 = vec_reve(self.data); // v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v1, v1 + v0 return vec_extract(tmp1, 0); } template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { return reduce_add(self, common {}); } // reduce_mul template XSIMD_INLINE signed reduce_mul(batch const& self, requires_arch) noexcept { auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0 auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0 auto tmp3 = vec_mul(tmp1, tmp2); return vec_extract(tmp3, 0); } template XSIMD_INLINE unsigned reduce_mul(batch const& self, requires_arch) noexcept { auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_mul(self.data, tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0 auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0 auto tmp3 = vec_mul(tmp1, tmp2); return vec_extract(tmp3, 0); } template XSIMD_INLINE float reduce_mul(batch const& self, requires_arch) noexcept { // FIXME: find an in-order approach auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_mul(self.data, 
tmp0); // v0 * v3, v1 * v2, v2 * v1, v3 * v0 auto tmp2 = vec_mergel(tmp1, tmp1); // v2 * v1, v2 * v1, v3 * v0, v3 * v0 auto tmp3 = vec_mul(tmp1, tmp2); return vec_extract(tmp3, 0); } template XSIMD_INLINE double reduce_mul(batch const& self, requires_arch) noexcept { auto tmp0 = vec_reve(self.data); // v1, v0 auto tmp1 = vec_mul(self.data, tmp0); // v0 * v1, v1 * v0 return vec_extract(tmp1, 0); } template ::value>> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept { return reduce_mul(self, common {}); } // round // vec_round exists also for float vectors but is mapped to vrfin instruction which uses the wrong rounding mode #if defined __has_builtin && __has_builtin(__builtin_vsx_xvrspi) template XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept { return __builtin_vsx_xvrspi(self.data); } #endif // For double vectors vec_round uses xvrdpi which does the right thing template XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept { return vec_round(self.data); } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return vec_rsqrt(val.data); } template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return vec_rsqrt(val.data); } // select template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return vec_sel(false_br.data, true_br.data, cond.data); } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... 
}, true_br, false_br, vsx {}); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return vec_perm(x.data, y.data, (__vector unsigned char) { 4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3, 4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3, 4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3, 4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 }); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return vec_perm(x.data, y.data, (__vector unsigned char) { 8 * I0 + 0, 8 * I0 + 1, 8 * I0 + 2, 8 * I0 + 3, 8 * I0 + 4, 8 * I0 + 5, 8 * I0 + 6, 8 * I0 + 7, 8 * I1 + 0, 8 * I1 + 1, 8 * I1 + 2, 8 * I1 + 3, 8 * I1 + 4, 8 * I1 + 5, 8 * I1 + 6, 8 * I1 + 7, }); } // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return vec_sqrt(val.data); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return vec_sqrt(val.data); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) { return batch(0); } else { auto slider = vec_splats((uint8_t)(8 * N)); return (typename batch::register_type)vec_slo(x.data, slider); } } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) { return batch(0); } else { auto slider = vec_splats((uint8_t)(8 * N)); return (typename batch::register_type)vec_sro((__vector unsigned char)x.data, slider); } } // sadd template ::value && sizeof(T) != 8>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return vec_adds(self.data, other.data); } // set template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return typename batch::register_type { values... 
}; } template ::value>> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... }; } // ssub template ::value && sizeof(T) == 1>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return vec_subs(self.data, other.data); } // store_aligned template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return vec_st(self.data, 0, reinterpret_cast::register_type*>(mem)); } // store_unaligned template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return vec_vsx_st(self.data, 0, reinterpret_cast::register_type*>(mem)); } // sub template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return vec_sub(self.data, other.data); } // swizzle template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return vec_perm(self.data, self.data, (__vector unsigned char) { 4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, 4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, 4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, 4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return vec_perm(self.data, self.data, (__vector unsigned char) { 8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3, 8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * V0 + 7, 8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3, 8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7, }); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return vec_perm(self.data, self.data, (__vector unsigned char) { 8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3, 8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * 
V0 + 7, 8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3, 8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7, }); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, vsx {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return vec_perm(self.data, self.data, (__vector unsigned char) { 4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, 4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, 4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, 4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, vsx {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return vec_perm(self.data, self.data, (__vector unsigned char) { 2 * V0 + 0, 2 * V0 + 1, 2 * V1 + 0, 2 * V1 + 1, 2 * V2 + 0, 2 * V2 + 1, 2 * V3 + 0, 2 * V3 + 1, 2 * V4 + 0, 2 * V4 + 1, 2 * V5 + 0, 2 * V5 + 1, 2 * V6 + 0, 2 * V6 + 1, 2 * V7 + 0, 2 * V7 + 1 }); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, vsx {})); } // trunc template ::value>> XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return vec_trunc(self.data); } // widen template XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept { return { batch(vec_doublel(x.data)), batch(vec_doubleh(x.data)) }; } template ::value>> XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { auto even = vec_mule(x.data, vec_splats(T(1))); // x0, x2, x4, x6 auto odd = vec_mulo(x.data, vec_splats(T(1))); // x1, x3, x5, x7 return { batch, A>(vec_mergel(even, odd)), batch, A>(vec_mergeh(even, odd)) }; } // zip_hi template ::value>> XSIMD_INLINE batch zip_hi(batch 
const& self, batch const& other, requires_arch) noexcept { return vec_mergel(self.data, other.data); } // zip_lo template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return vec_mergeh(self.data, other.data); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_vxe.hpp000066400000000000000000001036461517435117100241440ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Andreas Krebbel * * Based on xsimd_vsx.hpp * * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_VXE_HPP #define XSIMD_VXE_HPP #include #include "../types/xsimd_vxe_register.hpp" namespace xsimd { namespace kernel { using namespace types; using v1ti = __int128 __attribute__((vector_size(16))); using v4sf = float __attribute__((vector_size(16))); using v2df = double __attribute__((vector_size(16))); using uv2di = unsigned long long int __attribute__((vector_size(16))); using v2di = long long int __attribute__((vector_size(16))); using uv4si = unsigned int __attribute__((vector_size(16))); using v4si = int __attribute__((vector_size(16))); using uv8hi = unsigned short int __attribute__((vector_size(16))); using v8hi = short int __attribute__((vector_size(16))); using uv16qi = unsigned char __attribute__((vector_size(16))); using v16qi = signed char __attribute__((vector_size(16))); // builtin_t - the scalar type as it would be used for a vector intrinsic // VXE vector intrinsics do not support long, unsigned long, and char // The builtin definition can be used to map the incoming // type to the right one to be used with the intrinsics. 
template struct builtin_scalar { using type = T; }; template <> struct builtin_scalar { using type = unsigned long long; }; template <> struct builtin_scalar { using type = long long; }; template <> struct builtin_scalar { using type = typename std::conditional::value, signed char, unsigned char>::type; }; template using builtin_t = typename builtin_scalar::type; // bitwise_cast template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return (typename batch::register_type)(self.data); } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return (typename batch_bool::register_type)self.data; } // load // load_unaligned template ::value>> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return (typename batch::register_type)vec_xl(0, (builtin_t*)mem); } // load_aligned template ::value>> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return load_unaligned(mem, kernel::convert {}, vxe {}); } // load_complex namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { // Interleave real and imaginary parts // hi = [r0, i0, r1, i1], lo = [r2, i2, r3, i3] // We need: real = [r0, r1, r2, r3], imag = [i0, i1, i2, i3] using v4sf = float __attribute__((vector_size(16))); uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; uv16qi perm_imag = (uv16qi) { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; v4sf real = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_real); v4sf imag = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_imag); return { batch(real), batch(imag) }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { // hi = [r0, i0], lo = [r1, i1] // We need: real = [r0, r1], imag = [i0, i1] using v2df = double 
__attribute__((vector_size(16))); uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }; uv16qi perm_imag = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }; v2df real = vec_perm((v2df)hi.data, (v2df)lo.data, perm_real); v2df imag = vec_perm((v2df)hi.data, (v2df)lo.data, perm_imag); return { batch(real), batch(imag) }; } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { uv16qi perm = (uv16qi) { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; return batch(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm)); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { uv16qi perm = (uv16qi) { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 }; return batch(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm)); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { uv16qi perm = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }; return batch(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm)); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { uv16qi perm = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }; return batch(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm)); } } // store template XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept { vec_xst(src.data, 0, (builtin_t*)dst); } template XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept { store_aligned(dst, src, vxe {}); } // set template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return typename batch::register_type { values... }; } template ::value>> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... 
values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... }; } // first template ::value>> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { return self.data[0]; } // insert template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { // vec_insert on float is broken with clang batch out(self); out.data[I] = val; return out; } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return self.data == other.data; } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return self.data == other.data; } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return self.data < other.data; } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return self.data <= other.data; } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self.data == other.data); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return bitwise_xor(self, other); } // sub template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return self.data - other.data; } // broadcast template ::value>> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { return vec_splats(static_cast>(val)); } // abs template ::value, void>::type> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { return vec_abs(self.data); } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return (typename batch::register_type)((v4si)self.data & (v4si)other.data); } template XSIMD_INLINE 
batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return self.data & other.data; } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return (typename batch::register_type)((v4si)self.data | (v4si)other.data); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return self.data | other.data; } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return (typename batch::register_type)((v4si)self.data ^ (v4si)other.data); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return self.data ^ other.data; } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { // ~ operator does not work on floating point vectors return (typename batch::register_type)(~(v4si)self.data); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return ~self.data; } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return (typename batch::register_type)((v4si)self.data & ~(v4si)other.data); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return self.data & ~other.data; } // div template ::value>> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return self.data / other.data; } // neg template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return (typename batch::register_type) { 0 } - self.data; } // add template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return self.data + other.data; } // all template XSIMD_INLINE bool all(batch_bool 
const& self, requires_arch) noexcept { return ((v1ti)self.data)[0] == -1; } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return ((v1ti)self.data)[0] != 0; } // avgr template ::value>> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { return vec_avg(self.data, other.data); } // max template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return vec_max(self.data, other.data); } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return vec_min(self.data, other.data); } // fma template ::value>> XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vec_madd(x.data, y.data, z.data); } // fms template ::value>> XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return vec_msub(x.data, y.data, z.data); } // mul template ::value>> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return self.data * other.data; } // haddp template XSIMD_INLINE batch haddp(batch const* r, requires_arch) noexcept { v4sf lo01, hi01, lo23, hi23, sum01, sum23, sumeven, sumodd; lo01 = vec_mergel(r[0].data, r[1].data); // { r[0][2], r[1][2], r[0][3], r[1][3] } hi01 = vec_mergeh(r[0].data, r[1].data); // { r[0][0], r[1][0], r[0][1], r[1][1] } lo23 = vec_mergel(r[2].data, r[3].data); // { r[2][2], r[3][2], r[2][3], r[3][3] } hi23 = vec_mergeh(r[2].data, r[3].data); // { r[2][0], r[3][0], r[2][1], r[3][1] } sum01 = lo01 + hi01; // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[0][1] + r[0][3], r[1][1] + r[1][3] } sum23 = lo23 + hi23; // { r[2][0] + r[2][2], r[3][0] + r[3][2], r[2][1] + r[2][3], r[3][1] + r[3][3] } sumeven = (v4sf)vec_mergeh((v2di)sum01, (v2di)sum23); // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[2][0] + r[2][2], r[3][0] + r[3][2] } sumodd = (v4sf)vec_mergel((v2di)sum01, (v2di)sum23); 
// { r[0][1] + r[0][3], r[1][1] + r[1][3], r[2][1] + r[2][3], r[3][1] + r[3][3] } return sumeven + sumodd; } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return vec_mergeh(row[0].data, row[1].data) + vec_mergel(row[0].data, row[1].data); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { v4sf shifted_64 = vec_sld(self.data, self.data, 8); v4sf added_1 = self.data + shifted_64; v4sf shifted_32 = vec_sld(added_1, added_1, 4); return (added_1 + shifted_32)[0]; } template XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept { return (self.data + vec_sld(self.data, self.data, 8))[0]; } template XSIMD_INLINE uint64_t reduce_add(batch const& self, requires_arch) noexcept { uv2di shifted = vec_sld((uv2di)self.data, (uv2di)self.data, 8); uv2di sum = (uv2di)self.data + shifted; return (uint64_t)sum[0]; } template XSIMD_INLINE int64_t reduce_add(batch const& self, requires_arch) noexcept { v2di shifted = vec_sld((v2di)self.data, (v2di)self.data, 8); v2di sum = (v2di)self.data + shifted; return (int64_t)sum[0]; } template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { using t = typename batch::register_type; t shifted_64 = vec_sld(self.data, self.data, 8); t added_1 = self.data + shifted_64; t shifted_32 = vec_sld(added_1, added_1, 4); return (added_1 + shifted_32)[0]; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { using t = typename batch::register_type; t shifted_64 = vec_sld(self.data, self.data, 8); t added_1 = self.data + shifted_64; t shifted_32 = vec_sld(added_1, added_1, 4); t added_2 = added_1 + shifted_32; t shifted_16 = vec_sld(added_2, added_2, 2); return (added_2 + shifted_16)[0]; } else { using t = typename batch::register_type; t shifted_64 = vec_sld(self.data, self.data, 8); t added_1 = self.data + shifted_64; t shifted_32 = vec_sld(added_1, added_1, 4); t added_2 = added_1 + 
shifted_32; t shifted_16 = vec_sld(added_2, added_2, 2); t added_3 = added_2 + shifted_16; t shifted_8 = vec_sld(added_3, added_3, 1); return (added_3 + shifted_8)[0]; } } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return vec_sel(false_br.data, true_br.data, cond.data); } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, vxe {}); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) { return batch(0); } else { auto shift_count = vec_splats((uint8_t)(8 * N)); return vec_sll(x.data, shift_count); } } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) { return batch(0); } else { auto shift_count = vec_splats((uint8_t)(8 * N)); return vec_srl(x.data, shift_count); } } // sqrt template ::value>> XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return vec_sqrt(val.data); } // rsqrt template ::value>> XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return batch(T(1)) / sqrt(val, vxe {}); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return vec_perm(x.data, y.data, (__vector unsigned char) { 4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3, 4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3, 4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3, 4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 }); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return vec_perm(x.data, y.data, (__vector unsigned char) { 8 * I0 + 0, 8 * I0 + 1, 8 * I0 + 2, 8 * I0 + 3, 8 * I0 + 4, 8 * I0 + 5, 8 * I0 + 
6, 8 * I0 + 7, 8 * I1 + 0, 8 * I1 + 1, 8 * I1 + 2, 8 * I1 + 3, 8 * I1 + 4, 8 * I1 + 5, 8 * I1 + 6, 8 * I1 + 7, }); } // swizzle // 16 x 8bit template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); uv16qi perm = (uv16qi) { Values... }; return vec_perm(self.data, self.data, perm); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); uv16qi perm = (uv16qi) { Values... }; return vec_perm(self.data, self.data, perm); } // 8 x 16 bit template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { uv16qi perm = (uv16qi) { 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1, 2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1 }; return vec_perm(self.data, self.data, perm); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { uv16qi perm = (uv16qi) { 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1, 2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1 }; return vec_perm(self.data, self.data, perm); } // 4 x 32 bit template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { uv16qi perm = (uv16qi) { 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }; return vec_perm(self.data, self.data, perm); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { uv16qi perm = (uv16qi) { 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }; return 
vec_perm(self.data, self.data, perm); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { uv16qi perm = (uv16qi) { 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }; return vec_perm(self.data, self.data, perm); } // 2 x 64 bit template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { using out = typename batch::register_type; uv16qi perm = (uv16qi) { 8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3, 8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * V0 + 7, 8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3, 8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7, }; return (out)vec_perm((uv2di)self.data, (uv2di)self.data, perm); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { using out = typename batch::register_type; uv16qi perm = (uv16qi) { 8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3, 8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * V0 + 7, 8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3, 8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7, }; return (out)vec_perm((v2di)self.data, (v2di)self.data, perm); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { uv16qi perm = (uv16qi) { 8 * V0 + 0, 8 * V0 + 1, 8 * V0 + 2, 8 * V0 + 3, 8 * V0 + 4, 8 * V0 + 5, 8 * V0 + 6, 8 * V0 + 7, 8 * V1 + 0, 8 * V1 + 1, 8 * V1 + 2, 8 * V1 + 3, 8 * V1 + 4, 8 * V1 + 5, 8 * V1 + 6, 8 * V1 + 7, }; return vec_perm(self.data, self.data, perm); } // zip_hi template ::value>> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return vec_mergel(self.data, other.data); } // zip_lo template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return vec_mergeh(self.data, other.data); } // bitwise_rshift template ::value, void>::type> 
XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { return self.data >> other; } // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { return self.data << other; } // isnan template ::value>> XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return ~vec_cmpeq(self.data, self.data); } // ceil template ::value>> XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return vec_ceil(self.data); } // floor template ::value>> XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return vec_floor(self.data); } // round // vec_round rounds ties to even instead of zero #if defined __has_builtin && __has_builtin(__builtin_s390_vfi) template ::value>> XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept { return __builtin_s390_vfi(self.data, 4, 1); } #endif // trunc template ::value>> XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return vec_trunc(self.data); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/arch/xsimd_wasm.hpp000066400000000000000000002261311517435117100243040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Anutosh Bhat * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_WASM_HPP #define XSIMD_WASM_HPP #include #include "../types/xsimd_wasm_register.hpp" #include "./common/xsimd_common_cast.hpp" namespace xsimd { template struct batch_bool_constant; template XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; template struct batch_constant; namespace kernel { using namespace types; // fwd template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; template XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; // abs template ::value && std::is_signed::value>> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_abs(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_abs(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_abs(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_abs(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { return wasm_f32x4_abs(self); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { return wasm_f64x2_abs(self); } // add template ::value>> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_add(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_add(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_add(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_add(self, other); } else { assert(false && "unsupported arch/op combination"); 
return {}; } } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_add(self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_add(self, other); } // avgr template ::value>> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_avgr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_avgr(self, other); } else { return avgr(self, other, common {}); } } // avg template ::value>> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto adj = ((self ^ other) << 7) >> 7; return avgr(self, other, A {}) - adj; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto adj = ((self ^ other) << 15) >> 15; return avgr(self, other, A {}) - adj; } else { return avg(self, other, common {}); } } // all template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return wasm_i32x4_bitmask(self) == 0x0F; } template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return wasm_i64x2_bitmask(self) == 0x03; } template ::value>> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return wasm_i8x16_bitmask(self) == 0xFFFF; } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return wasm_i32x4_bitmask(self) != 0; } template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return wasm_i64x2_bitmask(self) != 0; } template ::value>> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return wasm_i8x16_bitmask(self) != 0; } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and template 
XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_and(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_and(self, other); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_andnot(self, other); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_andnot(self, other); } // bitwise_cast template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_or(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_or(self, other); } // bitwise_lshift template ::value>> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_shl(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_shl(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } // bitwise_rshift template ::value>> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_shr(self, other); } else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_shr(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_shr(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_u64x2_shr(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return wasm_v128_not(self); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return wasm_v128_not(self); } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return wasm_v128_xor(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_xor(self, other); } // broadcast template batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept { return wasm_f32x4_splat(val); } template ::value>> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_splat(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_splat(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_splat(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_splat(val); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return wasm_f64x2_splat(val); } // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return wasm_f32x4_ceil(self); } template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return 
wasm_f64x2_ceil(self); } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_div(self, other); } template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_div(self, other); } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_eq(self, other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_i32x4_eq(self, other); } template ::value>> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_eq(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template ::value>> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_eq(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_eq(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_eq(self, other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_i64x2_eq(self, other); } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) 
noexcept { return wasm_f32x4_convert_i32x4(self); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to wasm v128_t xH = wasm_u64x2_shr(x, 32); xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84 v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52 return wasm_f64x2_add(f, xL); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to wasm v128_t xH = wasm_i32x4_shr(x, 16); xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67 v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52 v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52 return wasm_f64x2_add(f, xL); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return wasm_i32x4_make( static_cast(wasm_f32x4_extract_lane(self, 0)), static_cast(wasm_f32x4_extract_lane(self, 1)), static_cast(wasm_f32x4_extract_lane(self, 2)), static_cast(wasm_f32x4_extract_lane(self, 3))); } } // first template XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return wasm_f32x4_extract_lane(self, 0); } template ::value>> XSIMD_INLINE T 
first(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_extract_lane(self, 0); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_extract_lane(self, 0); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_extract_lane(self, 0); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_extract_lane(self, 0); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return wasm_f64x2_extract_lane(self, 0); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return wasm_f32x4_floor(self); } template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return wasm_f64x2_floor(self); } // from_mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, }; assert(!(mask & ~0xFul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut[mask]); } template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, 
requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; assert(!(mask & ~0x3ul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut[mask]); } template ::value>> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000, 0x000000000000FFFF, 0x00000000FFFF0000, 0x00000000FFFFFFFF, 0x0000FFFF00000000, 0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF, 0xFFFF000000000000, 0xFFFF00000000FFFF, 0xFFFF0000FFFF0000, 0xFFFF0000FFFFFFFF, 0xFFFFFFFF00000000, 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF, }; alignas(A::alignment()) static const uint32_t lut32[] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, }; alignas(A::alignment()) static const uint32_t lut16[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF, 
0xFFFFFFFF, 0xFFFFFFFF }, }; alignas(A::alignment()) static const uint64_t lut8[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFF) && "inbound mask"); return wasm_i32x4_make(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFF) && "inbound mask"); return wasm_i64x2_make(lut64[mask & 0xF], lut64[mask >> 4]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { assert(!(mask & ~0xFul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut16[mask]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { assert(!(mask & ~0x3ul) && "inbound mask"); return wasm_v128_load((const v128_t*)lut8[mask]); } } // ge template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_ge(self, other); } template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_ge(self, other); } // gt template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_gt(self, other); } template ::value>> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_gt(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return 
wasm_u16x8_gt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_gt(self, other); } else { return gt(self, other, common {}); } } } template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_gt(self, other); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { v128_t tmp0 = wasm_i32x4_shuffle(row[0], row[1], 0, 4, 1, 5); v128_t tmp1 = wasm_i32x4_shuffle(row[0], row[1], 2, 6, 3, 7); v128_t tmp2 = wasm_i32x4_shuffle(row[2], row[3], 2, 6, 3, 7); tmp0 = wasm_f32x4_add(tmp0, tmp1); tmp1 = wasm_i32x4_shuffle(row[2], row[3], 0, 4, 1, 5); tmp1 = wasm_f32x4_add(tmp1, tmp2); tmp2 = wasm_i32x4_shuffle(tmp1, tmp0, 6, 7, 2, 3); tmp0 = wasm_i32x4_shuffle(tmp0, tmp1, 0, 1, 4, 5); return wasm_f32x4_add(tmp0, tmp2); } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return wasm_f64x2_add(wasm_i64x2_shuffle(row[0], row[1], 0, 2), wasm_i64x2_shuffle(row[0], row[1], 1, 3)); } // insert template XSIMD_INLINE batch insert(batch const& self, float val, index pos, requires_arch) noexcept { return wasm_f32x4_replace_lane(self, pos, val); } template ::value>> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_replace_lane(self, pos, val); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return 
wasm_u32x4_replace_lane(self, pos, val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_u64x2_replace_lane(self, pos, val); } else { assert(false && "unsupported arch/op combination"); return {}; } } } template XSIMD_INLINE batch insert(batch const& self, double val, index pos, requires_arch) noexcept { return wasm_f64x2_replace_lane(self, pos, val); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return wasm_v128_or(wasm_f32x4_ne(self, self), wasm_f32x4_ne(self, self)); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return wasm_v128_or(wasm_f64x2_ne(self, self), wasm_f64x2_ne(self, self)); } // le template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_le(self, other); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_le(self, other); } // load_aligned template XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } template ::value>> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return wasm_v128_load((v128_t const*)mem); } template XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } // load_complex namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { wasm_i32x4_shuffle(hi, lo, 0, 2, 4, 6), wasm_i32x4_shuffle(hi, lo, 1, 3, 5, 7) }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { wasm_i64x2_shuffle(hi, lo, 0, 2), wasm_i64x2_shuffle(hi, lo, 1, 3) }; } } // load_unaligned template XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } template ::value>> XSIMD_INLINE batch 
load_unaligned(T const* mem, convert, requires_arch) noexcept { return wasm_v128_load((v128_t const*)mem); } template XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return wasm_v128_load(mem); } // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_lt(self, other); } template ::value>> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_lt(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_u32x4_lt(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto xself = wasm_v128_xor(self, wasm_i64x2_splat(std::numeric_limits::lowest())); auto xother = wasm_v128_xor(other, wasm_i64x2_splat(std::numeric_limits::lowest())); v128_t tmp1 = wasm_i64x2_sub(xself, xother); v128_t tmp2 = wasm_v128_xor(xself, xother); v128_t tmp3 = wasm_v128_andnot(xself, xother); v128_t tmp4 = wasm_v128_andnot(tmp1, tmp2); v128_t tmp5 = wasm_v128_or(tmp3, tmp4); v128_t tmp6 = wasm_i32x4_shr(tmp5, 31); return wasm_i32x4_shuffle(tmp6, wasm_i32x4_splat(0), 1, 1, 3, 3); } else { assert(false && "unsupported arch/op combination"); return {}; } } } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_lt(self, other); } // mask template ::value>> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { 
XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_bitmask(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_bitmask(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_bitmask(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_bitmask(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return wasm_i32x4_bitmask(self); } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return wasm_i64x2_bitmask(self); } // max template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_pmax(self, other); } template ::value>> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_pmax(self, other); } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_pmin(self, other); } template ::value>> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_pmin(self, other); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_mul(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_mul(self, other); } // neg template ::value>> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_neg(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_neg(self); } else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_neg(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_neg(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return wasm_f32x4_neg(self); } template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return wasm_f64x2_neg(self); } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_ne(self, other); } template ::value>> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_xor(self, other); } template ::value>> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self == other); } template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_ne(self, other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return wasm_v128_xor(self, other); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, requires_arch) noexcept { v128_t one = wasm_f32x4_splat(1.0f); return wasm_f32x4_div(one, self); } template XSIMD_INLINE batch reciprocal(batch const& self, requires_arch) noexcept { v128_t one = wasm_f64x2_splat(1.0); return wasm_f64x2_div(one, self); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { v128_t tmp0 = wasm_f32x4_add(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3)); v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4); v128_t tmp2 = wasm_f32x4_add(tmp0, tmp1); v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3); return 
wasm_f32x4_extract_lane(tmp3, 0); } template ::value>> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); v128_t tmp1 = wasm_i32x4_add(self, tmp0); v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0); v128_t tmp3 = wasm_i32x4_add(tmp1, tmp2); return wasm_i32x4_extract_lane(tmp3, 0); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); v128_t tmp1 = wasm_i64x2_add(self, tmp0); return wasm_i64x2_extract_lane(tmp1, 0); } else { return reduce_add(self, common {}); } } template XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept { v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3); v128_t tmp1 = wasm_f64x2_add(self, tmp0); v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1); return wasm_f64x2_extract_lane(tmp2, 0); } // reduce_mul template XSIMD_INLINE float reduce_mul(batch const& self, requires_arch) noexcept { v128_t tmp0 = wasm_f32x4_mul(self, wasm_i32x4_shuffle(self, self, 6, 7, 2, 3)); v128_t tmp1 = wasm_i32x4_shuffle(tmp0, tmp0, 1, 0, 4, 4); v128_t tmp2 = wasm_f32x4_mul(tmp0, tmp1); v128_t tmp3 = wasm_i32x4_shuffle(tmp0, tmp2, 4, 1, 2, 3); return wasm_f32x4_extract_lane(tmp3, 0); } template ::value>> XSIMD_INLINE T reduce_mul(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); v128_t tmp1 = wasm_i32x4_mul(self, tmp0); v128_t tmp2 = wasm_i32x4_shuffle(tmp1, wasm_i32x4_splat(0), 1, 0, 0, 0); v128_t tmp3 = wasm_i32x4_mul(tmp1, tmp2); return wasm_i32x4_extract_lane(tmp3, 0); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { v128_t tmp0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); v128_t tmp1 = wasm_i64x2_mul(self, tmp0); return wasm_i64x2_extract_lane(tmp1, 0); } else { return reduce_mul(self, common {}); } } template 
XSIMD_INLINE double reduce_mul(batch const& self, requires_arch) noexcept { v128_t tmp0 = wasm_i64x2_shuffle(self, self, 1, 3); v128_t tmp1 = wasm_f64x2_mul(self, tmp0); v128_t tmp2 = wasm_i64x2_shuffle(tmp0, tmp1, 2, 1); return wasm_f64x2_extract_lane(tmp2, 0); } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& self, requires_arch) noexcept { v128_t one = wasm_f32x4_splat(1.0f); return wasm_f32x4_div(one, wasm_f32x4_sqrt(self)); } template XSIMD_INLINE batch rsqrt(batch const& self, requires_arch) noexcept { v128_t one = wasm_f64x2_splat(1.0); return wasm_f64x2_div(one, wasm_f64x2_sqrt(self)); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( wasm_i64x2_const(0, 0), x, ((N) & 0xF0) ? 0 : 16 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 17 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 18 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 19 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 20 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 21 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 22 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 23 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 24 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 25 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 26 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 27 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 28 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 29 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 30 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 31 - ((N) & 0xF)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( x, wasm_i64x2_const(0, 0), ((N) & 0xF0) ? 16 : ((N) & 0xF) + 0, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 1, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 2, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 3, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 4, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 5, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 6, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 7, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 8, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 9, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 10, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 11, ((N) & 0xF0) ? 
16 : ((N) & 0xF) + 12, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 13, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 14, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 15); } // sadd template ::value>> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_add_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_add_sat(self, other); } else { return sadd(self, other, common {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_add_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_add_sat(self, other); } else { return sadd(self, other, common {}); } } } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value>> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template ::value>> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... 
}, true_br, false_br, wasm {}); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { return wasm_i64x2_shuffle(x, y, I0, I1); } // set template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return wasm_f32x4_make(values...); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept { return wasm_i64x2_make(v0, v1); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return wasm_i32x4_make(v0, v1, v2, v3); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return wasm_i16x8_make(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value>> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return wasm_i8x16_make(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return wasm_f64x2_make(values...); } template ::value>> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... 
values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } // ssub template ::value>> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_sub_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_sub_sat(self, other); } else { return ssub(self, other, common {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_u8x16_sub_sat(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_u16x8_sub_sat(self, other); } else { return ssub(self, other, common {}); } } } // store_aligned template XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template ::value>> XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } // store_complex namespace detail { // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return wasm_i32x4_shuffle(self.real(), self.imag(), 0, 4, 1, 5); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return wasm_i32x4_shuffle(self.real(), self.imag(), 2, 6, 3, 7); } template XSIMD_INLINE batch 
complex_low(batch, A> const& self, requires_arch) noexcept { return wasm_i64x2_shuffle(self.real(), self.imag(), 0, 2); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return wasm_i64x2_shuffle(self.real(), self.imag(), 1, 3); } } // store_unaligned template XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template ::value>> XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return wasm_v128_store((v128_t*)mem, self); } template XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return wasm_v128_store(mem, self); } // sub template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f32x4_sub(self, other); } template ::value>> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_sub(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_sub(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_sub(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_sub(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return wasm_f64x2_sub(self, other); } // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return wasm_f32x4_sqrt(val); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return wasm_f64x2_sqrt(val); } // swizzle template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return 
wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i64x2_shuffle(self, self, V0, V1); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i64x2_shuffle(self, self, V0, V1); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1] auto t1 = 
wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3] auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1] auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3] matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0] matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1] matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2] matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3] } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto r0 = matrix_begin[0], r1 = matrix_begin[1]; matrix_begin[0] = wasm_i64x2_shuffle(r0, r1, 0, 2); matrix_begin[1] = wasm_i64x2_shuffle(r0, r1, 1, 3); } else { transpose(matrix_begin, matrix_end, common {}); } } // trunc template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return wasm_f32x4_trunc(self); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return wasm_f64x2_trunc(self); } // widen template XSIMD_INLINE std::array, 2> widen(batch const& x, requires_arch) noexcept { return { batch(wasm_f64x2_promote_low_f32x4(x)), batch(wasm_f64x2_promote_low_f32x4(wasm_i32x4_shuffle(x, x, 2, 3, 0, 1))) }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(wasm_u16x8_extend_low_u8x16(x)), batch, A>(wasm_u16x8_extend_high_u8x16(x)) }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(wasm_i16x8_extend_low_i8x16(x)), batch, A>(wasm_i16x8_extend_high_i8x16(x)) }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(wasm_u32x4_extend_low_u16x8(x)), batch, A>(wasm_u32x4_extend_high_u16x8(x)) }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(wasm_i32x4_extend_low_i16x8(x)), batch, 
A>(wasm_i32x4_extend_high_i16x8(x)) }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(wasm_u64x2_extend_low_u32x4(x)), batch, A>(wasm_u64x2_extend_high_u32x4(x)) }; } template XSIMD_INLINE std::array, A>, 2> widen(batch const& x, requires_arch) noexcept { return { batch, A>(wasm_i64x2_extend_low_i32x4(x)), batch, A>(wasm_i64x2_extend_high_i32x4(x)) }; } // zip_hi template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7); } template ::value>> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_shuffle(self, other, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_shuffle(self, other, 4, 12, 5, 13, 6, 14, 7, 15); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_shuffle(self, other, 2, 6, 3, 7); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_shuffle(self, other, 1, 3); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return wasm_i64x2_shuffle(self, other, 1, 3); } // zip_lo template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5); } template ::value>> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return wasm_i8x16_shuffle(self, other, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return wasm_i16x8_shuffle(self, other, 0, 8, 1, 9, 2, 10, 3, 11); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return wasm_i32x4_shuffle(self, other, 0, 4, 1, 5); } else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return wasm_i64x2_shuffle(self, other, 0, 2); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return wasm_i64x2_shuffle(self, other, 0, 2); } } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/000077500000000000000000000000001517435117100217435ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_arch.hpp000066400000000000000000000201761517435117100246030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ARCH_HPP #define XSIMD_ARCH_HPP #include #include #include #include "../types/xsimd_all_registers.hpp" #include "./xsimd_config.hpp" #include "./xsimd_cpuid.hpp" namespace xsimd { /** * @ingroup architectures * * Dummy architectures that only appears in a list of architecture when no * other architecture has been detected. */ struct unavailable { static constexpr bool supported() noexcept { return false; } static constexpr bool available() noexcept { return false; } static constexpr std::size_t alignment() noexcept { return 0; } static constexpr bool requires_alignment() noexcept { return false; } static constexpr char const* name() noexcept { return ""; } }; namespace detail { // Checks whether T appears in Tys. 
template struct contains; template struct contains : std::false_type { }; template struct contains : std::conditional_t::value, std::true_type, contains> { }; template XSIMD_INLINE constexpr T max_of(T value) noexcept { return value; } template XSIMD_INLINE constexpr T max_of(T head0, T head1, Ts... tail) noexcept { return max_of((head0 > head1 ? head0 : head1), tail...); } template struct head; template struct head { using type = T; }; template <> struct head<> { using type = unavailable; }; } // namespace detail // An arch_list is a list of architectures. template struct arch_list { using best = typename detail::head::type; template using add = arch_list; template using extend = arch_list; template static constexpr bool contains() noexcept { return detail::contains::value; } template static XSIMD_INLINE void for_each(F&& f) noexcept { (void)std::initializer_list { (f(Archs {}), true)... }; } static constexpr std::size_t alignment() noexcept { // all alignments are a power of two return detail::max_of(Archs::alignment()..., static_cast(0)); } }; namespace detail { // Filter archlists Archs, picking only supported archs and adding // them to L. template struct supported_helper; template struct supported_helper> { using type = L; }; template struct supported_helper> : supported_helper< std::conditional_t, L>, arch_list> { }; template struct supported : supported_helper, Archs...> { }; // Joins all arch_list Archs in a single arch_list. 
template struct join; template struct join { using type = Arch; }; template struct join, Args...> : join, Args...> { }; } // namespace detail using all_x86_architectures = arch_list< avx512vnni, avx512vbmi2, avx512vbmi, avx512ifma, avx512pf, avx512vnni, avx512bw, avx512er, avx512dq, avx512cd, avx512f, avxvnni, fma3, avx2, fma3, avx, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; using all_arm_architectures = typename detail::join, neon64, neon>>::type; using all_power_architectures = arch_list; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; using all_s390x_architectures = arch_list; using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; using x86_arch = typename detail::supported::type::best; using arm_arch = typename detail::supported::type::best; using power_arch = typename detail::supported::type::best; using riscv_arch = typename detail::supported::type::best; using s390x_arch = typename detail::supported::type::best; using best_arch = typename supported_architectures::best; #ifdef XSIMD_DEFAULT_ARCH using default_arch = XSIMD_DEFAULT_ARCH; #else using default_arch = best_arch; #endif namespace detail { template class dispatcher { const decltype(available_architectures()) availables_archs; F functor; template XSIMD_INLINE auto walk_archs(arch_list, Tys&&... args) noexcept { assert(Arch::available() && "At least one arch must be supported during dispatch"); return functor(Arch {}, std::forward(args)...); } template XSIMD_INLINE auto walk_archs(arch_list, Tys&&... 
args) noexcept { if (availables_archs.has(Arch {})) return functor(Arch {}, std::forward(args)...); else return walk_archs(arch_list {}, std::forward(args)...); } public: XSIMD_INLINE dispatcher(F f) noexcept : availables_archs(available_architectures()) , functor(f) { } template XSIMD_INLINE auto operator()(Tys&&... args) noexcept { return walk_archs(ArchList {}, std::forward(args)...); } }; } // Generic function dispatch, à la ifunc template XSIMD_INLINE detail::dispatcher dispatch(F&& f) noexcept { return { std::forward(f) }; } } // namespace xsimd #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_config.hpp000066400000000000000000000347571517435117100251450ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_CONFIG_HPP #define XSIMD_CONFIG_HPP #define XSIMD_VERSION_MAJOR 14 #define XSIMD_VERSION_MINOR 2 #define XSIMD_VERSION_PATCH 0 #if defined(__GNUC__) && defined(__BYTE_ORDER__) #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define XSIMD_LITTLE_ENDIAN #endif #elif defined(_WIN32) // We can safely assume that Windows is always little endian #define XSIMD_LITTLE_ENDIAN #elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__) #define XSIMD_LITTLE_ENDIAN #endif /** * high level free functions * * @defgroup xsimd_config_macro Instruction Set Detection */ /** * @ingroup xsimd_config_macro * * Set to 1 if the target is the x86 architecture family. 
*/ #if defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) #define XSIMD_TARGET_X86 1 #else #define XSIMD_TARGET_X86 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if GNU-style inline assembly is available, to 0 otherwise. */ /* Use __clang__ || __GNUC__ for GNU-style inline asm. clang-cl runs in * MSVC-compatibility mode and does not define __GNUC__ by default, but it * still defines __clang__. Clang documents __asm__/__asm__ support and broad * GCC-extension compatibility: * https://clang.llvm.org/docs/LanguageExtensions.html * Clang only emits __GNUC__ when GNUCVersion != 0: * https://raw.githubusercontent.com/llvm/llvm-project/main/clang/lib/Frontend/InitPreprocessor.cpp * and GNUCVersion defaults to 0: * https://raw.githubusercontent.com/llvm/llvm-project/main/clang/include/clang/Basic/LangOptions.def */ #if defined(__clang__) || defined(__GNUC__) #define XSIMD_WITH_INLINE_ASM 1 #else #define XSIMD_WITH_INLINE_ASM 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 when the compiler is allowed to reassociate floating-point * operations (e.g. -ffast-math, -fassociative-math). Detected * automatically from __FAST_MATH__ (GCC/Clang) and __ASSOCIATIVE_MATH__ * (GCC). Clang does not define a macro for standalone * -fassociative-math; users should define XSIMD_REASSOCIATIVE_MATH=1 * manually in that case. */ #ifndef XSIMD_REASSOCIATIVE_MATH #if defined(__FAST_MATH__) || defined(__ASSOCIATIVE_MATH__) #define XSIMD_REASSOCIATIVE_MATH 1 #else #define XSIMD_REASSOCIATIVE_MATH 0 #endif #endif /** * @ingroup xsimd_config_macro * * Set to 1 if SSE2 is available at compile-time, to 0 otherwise. */ #ifdef __SSE2__ #define XSIMD_WITH_SSE2 1 #else #define XSIMD_WITH_SSE2 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if SSE3 is available at compile-time, to 0 otherwise. 
*/ #ifdef __SSE3__ #define XSIMD_WITH_SSE3 1 #else #define XSIMD_WITH_SSE3 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise. */ #ifdef __SSSE3__ #define XSIMD_WITH_SSSE3 1 #else #define XSIMD_WITH_SSSE3 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise. */ #ifdef __SSE4_1__ #define XSIMD_WITH_SSE4_1 1 #else #define XSIMD_WITH_SSE4_1 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise. */ #ifdef __SSE4_2__ #define XSIMD_WITH_SSE4_2 1 #else #define XSIMD_WITH_SSE4_2 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX is available at compile-time, to 0 otherwise. */ #ifdef __AVX__ #define XSIMD_WITH_AVX 1 #else #define XSIMD_WITH_AVX 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX2 is available at compile-time, to 0 otherwise. */ #ifdef __AVX2__ #define XSIMD_WITH_AVX2 1 #else #define XSIMD_WITH_AVX2 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVXVNNI is available at compile-time, to 0 otherwise. */ #ifdef __AVXVNNI__ #define XSIMD_WITH_AVXVNNI 1 #else #define XSIMD_WITH_AVXVNNI 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise. */ #ifdef __FMA__ #if defined(__SSE__) #ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643 #define XSIMD_WITH_FMA3_SSE 1 #endif #else #if XSIMD_WITH_FMA3_SSE #error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags" #endif #define XSIMD_WITH_FMA3_SSE 0 #endif #else #if XSIMD_WITH_FMA3_SSE #error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags" #endif #define XSIMD_WITH_FMA3_SSE 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise. 
*/ #ifdef __FMA__ #if defined(__AVX__) #ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643 #define XSIMD_WITH_FMA3_AVX 1 #endif #else #if XSIMD_WITH_FMA3_AVX #error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags" #endif #define XSIMD_WITH_FMA3_AVX 0 #endif #if defined(__AVX2__) #ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643 #define XSIMD_WITH_FMA3_AVX2 1 #endif #else #if XSIMD_WITH_FMA3_AVX2 #error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags" #endif #define XSIMD_WITH_FMA3_AVX2 0 #endif #else #if XSIMD_WITH_FMA3_AVX #error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags" #endif #if XSIMD_WITH_FMA3_AVX2 #error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags" #endif #define XSIMD_WITH_FMA3_AVX 0 #define XSIMD_WITH_FMA3_AVX2 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if FMA4 is available at compile-time, to 0 otherwise. */ #ifdef __FMA4__ #define XSIMD_WITH_FMA4 1 #else #define XSIMD_WITH_FMA4 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512F is available at compile-time, to 0 otherwise. */ #ifdef __AVX512F__ // AVX512 instructions are supported starting with gcc 6 // see https://www.gnu.org/software/gcc/gcc-6/changes.html // check clang first, newer clang always defines __GNUC__ = 4 #if defined(__clang__) && __clang_major__ >= 6 #define XSIMD_WITH_AVX512F 1 #elif defined(__GNUC__) && __GNUC__ < 6 #define XSIMD_WITH_AVX512F 0 #else #define XSIMD_WITH_AVX512F 1 #if __GNUC__ == 6 #define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1 #endif #endif #else #define XSIMD_WITH_AVX512F 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise. 
*/ #ifdef __AVX512CD__ // Avoids repeating the GCC workaround over and over #define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512CD 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise. */ #ifdef __AVX512DQ__ #define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512DQ 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise. */ #ifdef __AVX512BW__ #define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512BW 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512ER is available at compile-time, to 0 otherwise. */ #ifdef __AVX512ER__ #define XSIMD_WITH_AVX512ER XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512ER 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512PF is available at compile-time, to 0 otherwise. */ #ifdef __AVX512PF__ #define XSIMD_WITH_AVX512PF XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512PF 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512IFMA is available at compile-time, to 0 otherwise. */ #ifdef __AVX512IFMA__ #define XSIMD_WITH_AVX512IFMA XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512IFMA 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512VBMI is available at compile-time, to 0 otherwise. */ #ifdef __AVX512VBMI__ #define XSIMD_WITH_AVX512VBMI XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512VBMI 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512VBMI2 is available at compile-time, to 0 otherwise. */ #ifdef __AVX512VBMI2__ #define XSIMD_WITH_AVX512VBMI2 XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512VBMI2 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if AVX512VNNI is available at compile-time, to 0 otherwise. 
*/ #ifdef __AVX512VNNI__ #if XSIMD_WITH_AVX512VBMI2 #define XSIMD_WITH_AVX512VNNI_AVX512VBMI2 XSIMD_WITH_AVX512F #define XSIMD_WITH_AVX512VNNI_AVX512BW XSIMD_WITH_AVX512F #else #define XSIMD_WITH_AVX512VNNI_AVX512VBMI2 0 #define XSIMD_WITH_AVX512VNNI_AVX512BW XSIMD_WITH_AVX512F #endif #else #define XSIMD_WITH_AVX512VNNI_AVX512VBMI2 0 #define XSIMD_WITH_AVX512VNNI_AVX512BW 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if the target is in the ARM architecture family in 64 bits, to 0 otherwise */ #if defined(__aarch64__) || defined(_M_ARM64) #define XSIMD_TARGET_ARM64 1 #else #define XSIMD_TARGET_ARM64 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if the target is in the ARM architecture family, to 0 otherwise */ #if defined(__arm__) || defined(_M_ARM) || XSIMD_TARGET_ARM64 #define XSIMD_TARGET_ARM 1 #else #define XSIMD_TARGET_ARM 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if NEON is available at compile-time, to 0 otherwise. */ #if (defined(__ARM_NEON) && (__ARM_ARCH >= 7)) || XSIMD_TARGET_ARM64 #define XSIMD_WITH_NEON 1 #else #define XSIMD_WITH_NEON 0 #endif // Neon is always available on Arm64, though it is theoritially possible to compile // without it, such as -march=armv8-a+nosimd. // Note that MSVC may never define __ARM_NEON even when available. /** * @ingroup xsimd_config_macro * * Set to 1 if NEON64 is available at compile-time, to 0 otherwise. */ #if XSIMD_TARGET_ARM64 #define XSIMD_WITH_NEON64 1 #else #define XSIMD_WITH_NEON64 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if i8mm neon64 extension is available at compile-time, to 0 otherwise. */ #if defined(__ARM_FEATURE_MATMUL_INT8) #define XSIMD_WITH_I8MM_NEON64 1 #else #define XSIMD_WITH_I8MM_NEON64 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise. 
*/ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0 #define XSIMD_WITH_SVE 1 #define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS #else #define XSIMD_WITH_SVE 0 #define XSIMD_SVE_BITS 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if the target is the RISC-V architecture family. */ #ifdef __riscv #define XSIMD_TARGET_RISCV 1 #else #define XSIMD_TARGET_RISCV 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if RVV is available and bit width is pre-set at compile-time, to 0 otherwise. */ #if defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0 #define XSIMD_WITH_RVV 1 #define XSIMD_RVV_BITS __riscv_v_fixed_vlen #else #define XSIMD_WITH_RVV 0 #define XSIMD_RVV_BITS 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if WebAssembly SIMD is available at compile-time, to 0 otherwise. */ #ifdef __EMSCRIPTEN__ #define XSIMD_WITH_WASM 1 #else #define XSIMD_WITH_WASM 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if the target is in the PowerPC architecture family, to 0 otherwise */ #if defined(__powerpc__) || defined(__powerpc64__) || defined(_ARCH_PPC) || defined(_ARCH_PPC64) #define XSIMD_TARGET_PPC 1 #else #define XSIMD_TARGET_PPC 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if VMX with VSX extension is available at compile-time, to 0 otherwise. */ #if defined(__VEC__) && defined(__VSX__) #define XSIMD_WITH_VSX 1 #else #define XSIMD_WITH_VSX 0 #endif /** * @ingroup xsimd_config_macro * * Set to 1 if the target is in the IBM Z architecture family, to 0 otherwise */ #if defined(__s390x__) #define XSIMD_TARGET_S390X 1 #else #define XSIMD_TARGET_S390X 0 #endif /** + * @ingroup xsimd_config_macro + * + * Set to 1 if s390x VXE is available at compile-time, to 0 otherwise. + * Float vectors have been introduced with VXE included with IBM z14. 
+ */ #if defined(__VEC__) && __VEC__ >= 10304 && __ARCH__ >= 12 #define XSIMD_WITH_VXE 1 #else #define XSIMD_WITH_VXE 0 #endif // Workaround for MSVC compiler #ifdef _MSC_VER #if XSIMD_WITH_AVX512 #undef XSIMD_WITH_AVX2 #define XSIMD_WITH_AVX2 1 #endif #if XSIMD_WITH_AVX2 #undef XSIMD_WITH_AVX #define XSIMD_WITH_AVX 1 #undef XSIMD_WITH_FMA3_AVX #define XSIMD_WITH_FMA3_AVX 1 #undef XSIMD_WITH_FMA3_AVX2 #define XSIMD_WITH_FMA3_AVX2 1 #endif #if XSIMD_WITH_AVX #undef XSIMD_WITH_SSE4_2 #define XSIMD_WITH_SSE4_2 1 #endif #if XSIMD_WITH_SSE4_2 #undef XSIMD_WITH_SSE4_1 #define XSIMD_WITH_SSE4_1 1 #endif #if XSIMD_WITH_SSE4_1 #undef XSIMD_WITH_SSSE3 #define XSIMD_WITH_SSSE3 1 #endif #if XSIMD_WITH_SSSE3 #undef XSIMD_WITH_SSE3 #define XSIMD_WITH_SSE3 1 #endif #if XSIMD_WITH_SSE3 || ((defined(_M_AMD64) || defined(_M_X64)) && !defined(_M_ARM64EC)) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #undef XSIMD_WITH_SSE2 #define XSIMD_WITH_SSE2 1 #endif #endif #if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED && !XSIMD_WITH_VXE #define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif /** * @ingroup xsimd_config_macro * * Set to 1 if the target is a linux */ #if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) #define XSIMD_HAVE_LINUX_GETAUXVAL 1 #else #define XSIMD_HAVE_LINUX_GETAUXVAL 0 #endif #endif 
xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpu_features.hpp000066400000000000000000000041341517435117100263470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_CPU_FEATURES_HPP #define XSIMD_CPU_FEATURES_HPP #include "./xsimd_cpu_features_arm.hpp" #include "./xsimd_cpu_features_ppc.hpp" #include "./xsimd_cpu_features_riscv.hpp" #include "./xsimd_cpu_features_s390x.hpp" #include "./xsimd_cpu_features_x86.hpp" namespace xsimd { /** * Cross-platform CPU feature detection class. * * All member functions are safe to work on with all platforms. * * @warning This class is *not* thread safe. * Its internal lazy querying structure makes even `const` member function prone to data race. * The structure is also generally not appropriate for directly branching (e.g. on * ``cpu_features::avx2``) because it include a branch that the compiler cannot optimize. * The current appropriate way to use this class for dynamic dispatching is to store the * result of the function calls (e.g. @ref cpu_features) into (static) constants. * This is done in @ref xsimd::available_architectures. 
* * @see xsimd::dispatch * @see xsimd::available_architectures */ class cpu_features : public s390x_cpu_features, public ppc_cpu_features, public riscv_cpu_features, public arm_cpu_features, public x86_cpu_features { }; } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpu_features_arm.hpp000066400000000000000000000100361517435117100272040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ***************************************************************************/ #ifndef XSIMD_CPU_FEATURES_ARM_HPP #define XSIMD_CPU_FEATURES_ARM_HPP #include #include #include "./xsimd_config.hpp" #include "./xsimd_getauxval.hpp" #if XSIMD_TARGET_ARM && XSIMD_HAVE_LINUX_GETAUXVAL // HWCAP_XXX masks to use on getauxval results. // Header does not exists on all architectures and masks are architecture // specific. #include #endif // XSIMD_TARGET_ARM && XSIMD_HAVE_LINUX_GETAUXVAL namespace xsimd { namespace detail { using arm_reg64_t = std::uint64_t; /** * Return the SVE vector length in bytes for the current thread. * * SVE vector length can be restricted * Contrary to `svcntb` this does not require to be compiles with SVE, which * should not be done in a dynamic dispatch jump function. * * Safety: It is the user responsibility to first make sure that SVE is * available. */ inline arm_reg64_t arm_rdvl_unsafe(); } /** * An opinionated CPU feature detection utility for ARM. * * Combines compile-time knowledge with runtime detection when available. * On Linux, runtime detection uses getauxval to query the auxiliary vector. * On other platforms, only compile-time information is used. * * This is well defined on all architectures. 
* It will always return false on non-ARM architectures. */ class arm_cpu_features : private linux_hwcap_backend_default { public: inline bool neon() const noexcept; inline bool neon64() const noexcept; inline bool sve() const noexcept; inline std::size_t sve_size_bytes() const noexcept; inline bool i8mm() const noexcept; }; /******************** * Implementation * ********************/ namespace detail { #if XSIMD_TARGET_ARM64 && (defined(__GNUC__) || defined(__clang__)) __attribute__((target("arch=armv8-a+sve"))) inline arm_reg64_t arm_rdvl_unsafe() { arm_reg64_t vl; __asm__ volatile("rdvl %0, #1" : "=r"(vl)); return vl; } #else inline arm_reg64_t arm_rdvl_unsafe() { return 0; } #endif } inline bool arm_cpu_features::neon() const noexcept { #if XSIMD_TARGET_ARM && !XSIMD_TARGET_ARM64 && XSIMD_HAVE_LINUX_GETAUXVAL return hwcap().has_feature(HWCAP_NEON); #else return static_cast(XSIMD_WITH_NEON); #endif } inline bool arm_cpu_features::neon64() const noexcept { return static_cast(XSIMD_WITH_NEON64); } inline bool arm_cpu_features::sve() const noexcept { #if XSIMD_TARGET_ARM64 && XSIMD_HAVE_LINUX_GETAUXVAL return hwcap().has_feature(HWCAP_SVE); #else return false; #endif } inline std::size_t arm_cpu_features::sve_size_bytes() const noexcept { if (sve()) { return detail::arm_rdvl_unsafe(); } return 0; } inline bool arm_cpu_features::i8mm() const noexcept { #if XSIMD_TARGET_ARM64 && XSIMD_HAVE_LINUX_GETAUXVAL #ifdef HWCAP2_I8MM return hwcap2().has_feature(HWCAP2_I8MM); #else // Possibly missing on older Linux distributions return hwcap2().has_feature(1 << 13); #endif #else return false; #endif } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpu_features_ppc.hpp000066400000000000000000000035251517435117100272140ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge 
Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ***************************************************************************/ #ifndef XSIMD_CPU_FEATURES_PPC_HPP #define XSIMD_CPU_FEATURES_PPC_HPP #include "./xsimd_config.hpp" #include "./xsimd_getauxval.hpp" namespace xsimd { /** * An opinionated CPU feature detection utility for PowerPC. * * On Linux, runtime detection uses getauxval to query the auxiliary vector. * On other platforms, only compile-time information is used. * * This is well defined on all architectures. * It will always return false on non-PowerPC architectures. */ class ppc_cpu_features : private linux_hwcap_backend_default { public: inline bool vsx() const noexcept; }; /******************** * Implementation * ********************/ inline bool ppc_cpu_features::vsx() const noexcept { #if XSIMD_TARGET_PPC && XSIMD_HAVE_LINUX_GETAUXVAL #ifdef PPC_FEATURE_HAS_VSX return hwcap().has_feature(PPC_FEATURE_HAS_VSX); #else // Possibly missing on older Linux distributions return hwcap().has_feature(0x00000080); #endif #else return XSIMD_WITH_VSX; #endif } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpu_features_riscv.hpp000066400000000000000000000055601517435117100275610ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ***************************************************************************/ #ifndef XSIMD_CPU_FEATURES_RISCV_HPP #define XSIMD_CPU_FEATURES_RISCV_HPP #include #include #include "./xsimd_config.hpp" #include "./xsimd_getauxval.hpp" #if XSIMD_TARGET_RISCV && XSIMD_HAVE_LINUX_GETAUXVAL // HWCAP_XXX masks to use on getauxval results. // Header does not exists on all architectures and masks are architecture // specific. #include #endif // XSIMD_TARGET_RISCV && XSIMD_HAVE_LINUX_GETAUXVAL namespace xsimd { namespace detail { using riscv_reg64_t = std::uint64_t; /** * Return the RVV vector length in bytes. * * This does not require to be compiles with SVE, which should not * be done in a dynamic dispatch jump function. * * Safety: It is the user responsibility to first make sure that RVV is * available. */ inline riscv_reg64_t riscv_csrr_unsafe(); } class riscv_cpu_features : private linux_hwcap_backend_default { public: inline bool rvv() const noexcept; inline std::size_t rvv_size_bytes() const noexcept; }; /******************** * Implementation * ********************/ namespace detail { #if XSIMD_TARGET_RISCV && (defined(__GNUC__) || defined(__clang__)) __attribute__((target("arch=+v"))) inline riscv_reg64_t riscv_csrr_unsafe() { riscv_reg64_t vlenb; __asm__ volatile("csrr %0, vlenb" : "=r"(vlenb)); return vlenb; } #else inline riscv_reg64_t riscv_csrr_unsafe() { return 0; } #endif } inline bool riscv_cpu_features::rvv() const noexcept { #if XSIMD_TARGET_RISCV && XSIMD_HAVE_LINUX_GETAUXVAL #ifdef HWCAP_V return hwcap().has_feature(HWCAP_V); #else // Possibly missing on older Linux distributions return hwcap().has_feature(1 << ('V' - 'A')); #endif #else return false; #endif } inline std::size_t riscv_cpu_features::rvv_size_bytes() const noexcept { if (rvv()) { return detail::riscv_csrr_unsafe(); } return 0; } } #endif 
xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpu_features_s390x.hpp000066400000000000000000000037501517435117100273200ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Andreas Krebbel * * Based on xsimd_cpu_features_ppc.hpp * * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_CPU_FEATURES_S390X_HPP #define XSIMD_CPU_FEATURES_S390X_HPP #include "./xsimd_config.hpp" #include "./xsimd_getauxval.hpp" namespace xsimd { /** * An opinionated CPU feature detection utility for IBM Z. * * On Linux, runtime detection uses getauxval to query the auxiliary vector. * On other platforms, only compile-time information is used. * * This is well defined on all architectures. * It will always return false on non-IBM Z architectures. 
*/ class s390x_cpu_features : private linux_hwcap_backend_default { public: inline bool vxe() const noexcept; }; /******************** * Implementation * ********************/ inline bool s390x_cpu_features::vxe() const noexcept { #if XSIMD_TARGET_S390X && XSIMD_HAVE_LINUX_GETAUXVAL #ifdef HWCAP_S390_VXE return hwcap().has_feature(HWCAP_S390_VXE); #else // Possibly missing on older Linux distributions return hwcap().has_feature(8192); #endif #else return XSIMD_WITH_VXE; #endif } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpu_features_x86.hpp000066400000000000000000001153441517435117100270620ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_CPU_FEATURES_X86_HPP #define XSIMD_CPU_FEATURES_X86_HPP #include #include #include #include #include #if __cplusplus >= 201703L #include #endif #include "../utils/bits.hpp" #include "./xsimd_config.hpp" #if XSIMD_TARGET_X86 && defined(_MSC_VER) #include // Contains the definition of __cpuidex #endif namespace xsimd { namespace detail { using x86_reg32_t = std::uint32_t; using cpuid_reg_t = std::array; /** * CPU Identification (CPUID) instruction results. * * The CPUID instruction provides detailed information about the processor, * including supported instruction set extensions (SSE, AVX, AVX-512, etc.). * This function is well defined on all architectures but will return all zeros * on all non-x86 architectures. * * @param leaf The value inputted to the EAX register. * @param subleaf The value inputted to the ECX register. 
* * @see https://en.wikipedia.org/wiki/CPUID */ inline cpuid_reg_t x86_cpuid(int leaf, int subleaf = 0) noexcept; inline x86_reg32_t x86_xcr0_low() noexcept; /** A strongly type bitset for a 32 bits register. */ template using x86_reg32_bitset = utils::uint_bitset; /** A wrapper to attach a register bitfield descriptor and its CPUID index. */ template struct x86_reg_id { static constexpr x86_reg32_t index = I; using bits = E; static_assert(index >= 0 && index < 4, "At most 4 register in CPUID"); }; /** Find the register id with index k. */ template struct find_reg_k; /** Find the register id with index k (empty / nothing found case). */ template struct find_reg_k { using type = x86_reg_id; }; /** Find the register id with index k (recursive case). */ template struct find_reg_k { using type = std::conditional_t< reg_id_head::index == K, reg_id_head, typename find_reg_k::type>; }; /** * A class with strongly typed bitfield for `CPUID` registers. * * The class stores a variable number of register (up to four) from the CPUID * output. This is a space optimization to avoid storing many zeros in the * final `x86_cpu_features`. * As a result, some of the type aliases `eax`, `ebx`, `ecx`, `edx` may be `void`. */ template class x86_cpuid_regs : private x86_reg32_bitset... { private: static_assert(sizeof...(reg_ids) <= 4, "At most 4 register in CPUID"); /* Parse CPUINFO register value into individual bit components.*/ constexpr explicit x86_cpuid_regs(const cpuid_reg_t& regs) noexcept : x86_reg32_bitset(regs[reg_ids::index])... 
{ } public: static constexpr x86_reg32_t leaf = leaf_num; static constexpr x86_reg32_t subleaf = subleaf_num; using eax = typename find_reg_k<0, reg_ids...>::type::bits; using ebx = typename find_reg_k<1, reg_ids...>::type::bits; using ecx = typename find_reg_k<2, reg_ids...>::type::bits; using edx = typename find_reg_k<3, reg_ids...>::type::bits; inline static x86_cpuid_regs read() { return x86_cpuid_regs(detail::x86_cpuid(leaf, subleaf)); } constexpr x86_cpuid_regs() noexcept = default; // TODO(C++17) compact version for which this was designed. // The else clause contains a very verbose port. #if 0 using x86_reg32_bitset::all_bits_set...; using x86_reg32_bitset::get_range...; #else private: template struct m_empty_reg { enum class type {}; }; using eax_or_empty = std::conditional_t::value, typename m_empty_reg<0>::type, eax>; using ebx_or_empty = std::conditional_t::value, typename m_empty_reg<1>::type, ebx>; using ecx_or_empty = std::conditional_t::value, typename m_empty_reg<2>::type, ecx>; using edx_or_empty = std::conditional_t::value, typename m_empty_reg<3>::type, edx>; public: template ::value, int> = 0> constexpr bool all_bits_set() const noexcept { return x86_reg32_bitset::template all_bits_set(); } template ::value, int> = 0> constexpr x86_reg32_t get_range() const noexcept { return x86_reg32_bitset::template get_range(); } template ::value, int> = 0> constexpr bool all_bits_set() const noexcept { return x86_reg32_bitset::template all_bits_set(); } template ::value, int> = 0> constexpr x86_reg32_t get_range() const noexcept { return x86_reg32_bitset::template get_range(); } template ::value, int> = 0> constexpr bool all_bits_set() const noexcept { return x86_reg32_bitset::template all_bits_set(); } template ::value, int> = 0> constexpr x86_reg32_t get_range() const noexcept { return x86_reg32_bitset::template get_range(); } template ::value, int> = 0> constexpr bool all_bits_set() const noexcept { return x86_reg32_bitset::template all_bits_set(); } 
template ::value, int> = 0> constexpr x86_reg32_t get_range() const noexcept { return x86_reg32_bitset::template get_range(); } #endif // C++17 }; template struct x86_cpuid_highest_func { private: using x86_reg32_t = detail::x86_reg32_t; using manufacturer_str = std::array; public: static constexpr x86_reg32_t leaf = extended ? 0x80000000 : 0x0; inline static x86_cpuid_highest_func read() { auto regs = detail::x86_cpuid(0); x86_cpuid_highest_func out {}; // Highest function parameter in EAX out.m_highest_leaf = regs[0]; // Manufacturer string in EBX, EDX, ECX (in that order) char* manuf = out.m_manufacturer_id.data(); std::memcpy(manuf + 0 * sizeof(x86_reg32_t), ®s[1], sizeof(x86_reg32_t)); std::memcpy(manuf + 1 * sizeof(x86_reg32_t), ®s[3], sizeof(x86_reg32_t)); std::memcpy(manuf + 2 * sizeof(x86_reg32_t), ®s[2], sizeof(x86_reg32_t)); return out; } constexpr x86_cpuid_highest_func() noexcept = default; /** * Highest available leaf in CPUID non-extended range. * * This is the highest function parameter (EAX) that can be passed to CPUID. * This is valid in the specified range: * - if `extended` is `false`, that is below `0x80000000`, * - if `extended` is `true`, that is above `0x80000000`, */ constexpr x86_reg32_t highest_leaf() const noexcept { return m_highest_leaf; } /** * The manufacturer ID string in a static array. * * This raw character array is case specific and may contain both leading * and trailing whitespaces. * It cannot be assumed to be null terminated. * This is not implemented for all manufacturer when `extended` is `true`. */ constexpr manufacturer_str manufacturer_id_raw() const noexcept { return m_manufacturer_id; } #if __cplusplus >= 201703L constexpr std::string_view manufacturer_id() const noexcept { return { m_manufacturer_id.data(), m_manufacturer_id.size() }; } #endif private: manufacturer_str m_manufacturer_id {}; x86_reg32_t m_highest_leaf {}; }; } /** * Highest CPUID Function Parameter and Manufacturer ID (EAX=0). 
* * Returns the highest leaf value supported by CPUID in the standard range * (below 0x80000000), and the processor manufacturer ID string. * * @see https://en.wikipedia.org/wiki/CPUID */ using x86_cpuid_leaf0 = detail::x86_cpuid_highest_func; /** * Known processor manufacturer ID strings returned by CPUID leaf 0. * * The 12-byte manufacturer ID is stored in EBX, EDX, ECX (in that order). * Some strings are shared across physical CPUs, emulators, and virtual machines. * Obscure, defunct, and soft-core CPUs are not represented; they map to `unknown`. * * @see https://en.wikipedia.org/wiki/CPUID */ enum class x86_manufacturer { /** * AMD ("AuthenticAMD", "AMD ISBETTER"). * * "AMD ISBETTER" was used by early K5 engineering samples. */ amd, /** * Intel ("GenuineIntel", "GenuineIotel"). * * "GenuineIotel" is a rare typo variant seen on some chips. */ intel, /** * VIA / Centaur ("CentaurHauls", "VIA VIA VIA "). * * Centaur Technology was acquired by VIA in 1999; * older chips report "CentaurHauls", newer ones "VIA VIA VIA ". */ via, /** Zhaoxin (" Shanghai "). */ zhaoxin, /** Hygon ("HygonGenuine"). */ hygon, /** * Transmeta ("TransmetaCPU", "GenuineTMx86"). * * Two different ID strings were used across product lines. */ transmeta, /** MCST Elbrus ("E2K MACHINE "). */ elbrus, /** Microsoft Virtual PC / x86-to-ARM ("Virtual CPU "). */ microsoft_vpc, /** Unrecognized manufacturer ID string. */ unknown, }; /** * Parse a 12-byte CPUID manufacturer ID into an @ref x86_manufacturer value. * * The input is the raw character array returned by @ref x86_cpuid_leaf0::manufacturer_id_raw. * Unrecognized strings map to @ref x86_manufacturer::unknown. 
*/ inline x86_manufacturer x86_parse_manufacturer(const std::array& id) noexcept { auto eq = [&id](const char(&s)[13]) noexcept -> bool { return std::memcmp(id.data(), s, 12) == 0; }; if (eq("GenuineIntel") || eq("GenuineIotel")) return x86_manufacturer::intel; if (eq("AuthenticAMD") || eq("AMD ISBETTER")) return x86_manufacturer::amd; if (eq("CentaurHauls") || eq("VIA VIA VIA ")) return x86_manufacturer::via; if (eq(" Shanghai ")) return x86_manufacturer::zhaoxin; if (eq("HygonGenuine")) return x86_manufacturer::hygon; if (eq("TransmetaCPU") || eq("GenuineTMx86")) return x86_manufacturer::transmeta; if (eq("E2K MACHINE ")) return x86_manufacturer::elbrus; if (eq("Virtual CPU ")) return x86_manufacturer::microsoft_vpc; return x86_manufacturer::unknown; }; /** Return a string representation of an @ref x86_manufacturer value. */ constexpr const char* x86_manufacturer_name(x86_manufacturer m) noexcept { switch (m) { case x86_manufacturer::intel: return "intel"; case x86_manufacturer::amd: return "amd"; case x86_manufacturer::via: return "via"; case x86_manufacturer::zhaoxin: return "zhaoxin"; case x86_manufacturer::hygon: return "hygon"; case x86_manufacturer::transmeta: return "transmeta"; case x86_manufacturer::elbrus: return "elbrus"; case x86_manufacturer::microsoft_vpc: return "microsoft_vpc"; case x86_manufacturer::unknown: return "unknown"; } return "invalid"; } struct x86_cpuid_leaf1_traits { static constexpr detail::x86_reg32_t leaf = 1; static constexpr detail::x86_reg32_t subleaf = 0; enum class ecx { /* Streaming SIMD Extensions 3. */ sse3 = 0, /* Supplemental Streaming SIMD Extensions 3. */ ssse3 = 9, /* Fused multiply-add with 3 operands (FMA3). */ fma3 = 12, /* Streaming SIMD Extensions 4.1. */ sse4_1 = 19, /* Streaming SIMD Extensions 4.2. */ sse4_2 = 20, /* Population count instruction (POPCNT). */ popcnt = 23, /* OS has enabled XSAVE/XRSTOR for extended processor state management. */ osxsave = 27, /* Advanced Vector Extensions (256-bit SIMD). 
*/ avx = 28, }; enum class edx { /* Streaming SIMD Extensions 2. */ sse2 = 26, }; using regs_t = detail::x86_cpuid_regs, detail::x86_reg_id>; }; /** * Processor Info and Feature Bits. * * Utility class that can read and parse the registers for the first leaf level * of the CPUID instruction. * This is well defined on all architectures but will return all false on all * non-x86 architectures. * * @see https://en.wikipedia.org/wiki/CPUID */ using x86_cpuid_leaf1 = typename x86_cpuid_leaf1_traits::regs_t; struct x86_cpuid_leaf7_traits { static constexpr detail::x86_reg32_t leaf = 7; static constexpr detail::x86_reg32_t subleaf = 0; enum class eax { /* Start bit for the encoding of the highest subleaf available. */ highest_subleaf_start = 0, /* End bit for the encoding of the highest subleaf available. */ highest_subleaf_end = 32, }; enum class ebx { /* Bit Manipulation Instruction Set 1. */ bmi1 = 3, /* Advanced Vector Extensions 2 (integer 256-bit SIMD). */ avx2 = 5, /* Bit Manipulation Instruction Set 2. */ bmi2 = 8, /* AVX-512 Foundation instructions. */ avx512f = 16, /* AVX-512 Doubleword and Quadword instructions. */ avx512dq = 17, /* AVX-512 Integer Fused Multiply-Add instructions. */ avx512ifma = 21, /* AVX-512 Prefetch instructions. */ avx512pf = 26, /* AVX-512 Exponential and Reciprocal instructions. */ avx512er = 27, /* AVX-512 Conflict Detection instructions. */ avx512cd = 28, /* AVX-512 Byte and Word instructions. */ avx512bw = 30, }; enum class ecx { /* AVX-512 Vector Bit Manipulation instructions. */ avx512vbmi = 1, /* AVX-512 Vector Bit Manipulation instructions 2. */ avx512vbmi2 = 6, /* AVX-512 Vector Neural Network instructions. */ avx512vnni_bw = 11, }; using regs_t = detail::x86_cpuid_regs, detail::x86_reg_id, detail::x86_reg_id>; }; /** * Extended Feature Bits (EAX=7, ECX=0). * * Utility class that can read and parse the registers for the extended * feature bits leaf of the CPUID instruction. 
* This is well defined on all architectures but will return all false on all * non-x86 architectures. * * @see https://en.wikipedia.org/wiki/CPUID */ using x86_cpuid_leaf7 = typename x86_cpuid_leaf7_traits::regs_t; struct x86_cpuid_leaf7sub1_traits { static constexpr detail::x86_reg32_t leaf = 7; static constexpr detail::x86_reg32_t subleaf = 1; enum class eax { /* AVX (VEX-encoded) Vector Neural Network instructions. */ avxvnni = 4, }; using regs_t = detail::x86_cpuid_regs>; }; /** * Extended Feature Bits (EAX=7, ECX=1). * * Utility class that can read and parse the registers for the extended * feature bits, subleaf 1, of the CPUID instruction. * This is well defined on all architectures but will return all false on all * non-x86 architectures. * * @see https://en.wikipedia.org/wiki/CPUID */ using x86_cpuid_leaf7sub1 = typename x86_cpuid_leaf7sub1_traits::regs_t; /** * Highest Extended CPUID Function Parameter (EAX=0x80000000). * * Returns the highest leaf value supported by CPUID in the extended range * (at or above 0x80000000), and the processor manufacturer ID string. * * @see https://en.wikipedia.org/wiki/CPUID */ using x86_cpuid_leaf80000000 = detail::x86_cpuid_highest_func; struct x86_cpuid_leaf80000001_traits { static constexpr detail::x86_reg32_t leaf = 0x80000001; static constexpr detail::x86_reg32_t subleaf = 0; enum class ecx { /* AMD Fused multiply-add with 4 operands (FMA4). */ fma4 = 16, }; using regs_t = detail::x86_cpuid_regs>; }; /** * Extended Processor Info and Feature Bits. * * Utility class that can read and parse the registers for the extended * processor info leaf of the CPUID instruction. * This is well defined on all architectures but will return all false on all * non-x86 architectures. * * @see https://en.wikipedia.org/wiki/CPUID */ using x86_cpuid_leaf80000001 = typename x86_cpuid_leaf80000001_traits::regs_t; /* * Extended Control Register 0 (XCR0). 
* * Operating systems can explicitly disable the usage of instruction set (such * as SSE or AVX extensions) by setting an appropriate flag in XCR0 register. * This utility parses such bit values. * * @see https://docs.kernel.org/admin-guide/hw-vuln/gather_data_sampling.html */ class x86_xcr0 { public: enum class xcr0 { /** x87 FPU/MMX support (must be 1). */ x87 = 0, /** XSAVE support for MXCSR and XMM registers. */ sse = 1, /** AVX enabled and XSAVE support for upper halves of YMM registers. */ avx = 2, /** MPX enabled and XSAVE support for BND0-BND3 registers. */ bndreg = 3, /** MPX enabled and XSAVE support for BNDCFGU and BNDSTATUS registers. */ bndcsr = 4, /** AVX-512 enabled and XSAVE support for opmask registers k0-k7. */ opmask = 5, /** AVX-512 enabled and XSAVE support for upper halves of lower ZMM registers. */ zmm_hi256 = 6, /** AVX-512 enabled and XSAVE support for upper ZMM registers. */ hi16_zmm = 7, /** Saving/restoring Intel Processor Trace state via XSAVE enabled.*/ processor_trace = 8, /** XSAVE support for PKRU register. */ pkru = 9, }; /** * Create a default value with only SSE enabled. * * AVX and AVX512 strictly require OSXSAVE to be enabled by the OS. * If OSXSAVE is disabled (e.g., via bcdedit /set xsavedisable 1), AVX state won't * be preserved across context switches, so AVX cannot be used. * SSE is therefore the only value safe to assume. */ constexpr static x86_xcr0 safe_default() noexcept { x86_reg32_t low = {}; low = utils::make_bit_mask(static_cast(xcr0::sse)); return x86_xcr0(low); } /** * Read the XCR0 register from the CPU if on the correct architecture. * * This is only safe to call if bit 18 of CR4.OSXSAVE has been set. * * @see cpu_id::osxsave */ inline static x86_xcr0 read() { assert(x86_cpuid_leaf1::read().all_bits_set()); return x86_xcr0(detail::x86_xcr0_low()); } template constexpr bool all_bits_set() const noexcept { return m_low.all_bits_set(); } /** Create a value which return false to everything. 
*/ constexpr x86_xcr0() noexcept = default; private: using x86_reg32_t = detail::x86_reg32_t; using xcr0_reg_t = detail::x86_reg32_bitset; /** Parse a XCR0 value into individual components. */ constexpr explicit x86_xcr0(x86_reg32_t low) noexcept : m_low(low) { } xcr0_reg_t m_low {}; }; /** * Orchestrator for `CPUID` calls. * * This class orchestrate `CPUID` and `XCR0` calls so that they are made in the appropriate * order. It also implements lazy calling and cache mechanism around those calls. * Works on all platforms, and return all zeros on non `x86` platforms. */ class x86_cpu_features_backend_cpuid { public: x86_cpu_features_backend_cpuid() noexcept = default; inline x86_xcr0 const& xcr0() const noexcept; inline x86_cpuid_leaf0 const& leaf0() const; inline x86_cpuid_leaf80000000 const& leaf80000000() const; inline x86_cpuid_leaf1 const& leaf1() const; inline x86_cpuid_leaf7 const& leaf7() const; inline x86_cpuid_leaf7sub1 const& leaf7sub1() const; inline x86_cpuid_leaf80000001 const& leaf80000001() const; private: enum class status { leaf0_valid = 0, leaf1_valid = 1, leaf7_valid = 2, leaf7sub1_valid = 3, leaf80000000_valid = 4, leaf80000001_valid = 5, xcr0_valid = 6, }; using status_bitset = utils::uint_bitset; mutable x86_cpuid_leaf0 m_leaf0 {}; mutable x86_cpuid_leaf1 m_leaf1 {}; mutable x86_cpuid_leaf7 m_leaf7 {}; mutable x86_cpuid_leaf7sub1 m_leaf7sub1 {}; mutable x86_cpuid_leaf80000000 m_leaf80000000 {}; mutable x86_cpuid_leaf80000001 m_leaf80000001 {}; mutable x86_xcr0 m_xcr0 {}; mutable status_bitset m_status {}; inline bool osxsave() const noexcept; /** * Internal utility to lazily read and cache a CPUID leaf. * * @tparam status_id The status bit tracking whether this leaf has been read and cached. * @tparam L The CPUID leaf type (e.g. x86_cpuid_leaf1, x86_cpuid_leaf7). * @param leaf_cache A non-const reference to the class member that stores the leaf * value. It must be non-const because this function may write to it on first * call. 
It is passed explicitly (rather than accessed via `this`) to allow * factoring the caching logic across different leaf members. * @return A const reference to `leaf_cache`. The non-const input / const-ref output * asymmetry is intentional: callers must not modify the cached value, but * this function needs write access to populate it. * * On first call, checks whether the leaf number is within the range advertised as * supported by CPUID (via leaf 0 for the standard range, leaf 0x80000000 for the * extended range). If supported, reads the leaf from the CPU; otherwise leaves * `leaf_cache` at its zero-initialized default (all feature bits false). Either * way, `status_id` is set so subsequent calls return immediately. */ template inline auto const& safe_read_leaf(L& leaf_cache) const; }; /** * No-Op orchestrator for `CPUID` calls * * This does nothing and return zero-constructed objects on all calls. * This is meant as an optimization on non `x86` platforms as the * `x86_cpu_features_backend_cpuid` can be slightly large (hundred of bytes). */ class x86_cpu_features_backend_noop { public: constexpr x86_xcr0 xcr0() const noexcept { return {}; } constexpr x86_cpuid_leaf0 leaf0() const { return {}; } constexpr x86_cpuid_leaf80000000 leaf80000000() const { return {}; } constexpr x86_cpuid_leaf1 leaf1() const { return {}; } constexpr x86_cpuid_leaf7 leaf7() const { return {}; } constexpr x86_cpuid_leaf7sub1 leaf7sub1() const { return {}; } constexpr x86_cpuid_leaf80000001 leaf80000001() const { return {}; } }; #if XSIMD_TARGET_X86 using x86_cpu_features_backend_default = x86_cpu_features_backend_cpuid; #else using x86_cpu_features_backend_default = x86_cpu_features_backend_noop; #endif /** * An opiniated CPU feature detection utility for x86. * * These are high level features that combine multiple registers reads in sequence. * Instead of looking directly at raw CPUID results, this utility also checks that * permissions (e.g. 
OSXSAVE) are enabled, and otherwise return conservative defaults. * * This is well defined on all architectures. It will always return false on * non-x86 architectures. */ class x86_cpu_features : private x86_cpu_features_backend_default { public: x86_cpu_features() noexcept = default; inline bool sse_enabled() const noexcept { return xcr0().all_bits_set(); } inline bool avx_enabled() const noexcept { // Check both SSE and AVX bits even though AVX must imply SSE return xcr0().all_bits_set(); } inline bool avx512_enabled() const noexcept { // Check all SSE, AVX, optmask, and AVX512 bits even though AVX512 must // imply AVX, SSE, and masked operations. return xcr0().all_bits_set(); } /** * The manufacturer ID string in a static array. * * This raw character array is case specific and may contain both leading * and trailing whitespaces. * It cannot be assumed to be null terminated. */ inline auto manufacturer_id_raw() const noexcept { return leaf0().manufacturer_id_raw(); } #if __cplusplus >= 201703L inline std::string_view manufacturer_id() const noexcept { return leaf0().manufacturer_id(); } #endif /** The manufacturer ID string parsed into known common vendors. */ inline x86_manufacturer known_manufacturer() const noexcept { return x86_parse_manufacturer(manufacturer_id_raw()); } /** * Indicates whether the OS has enabled extended state management. * * When true, the OS has set bit 18 (OSXSAVE) in the CR4 control register, * enabling the XGETBV/XSETBV instructions to access XCR0 and support * processor extended state management using XSAVE/XRSTOR. * * This value is read from CPUID leaf 0x1, ECX bit 27, which reflects * the state of CR4.OSXSAVE. 
*/ inline bool osxsave() const noexcept { return leaf1().all_bits_set(); } inline bool sse2() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } inline bool sse3() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } inline bool ssse3() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } inline bool sse4_1() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } inline bool sse4_2() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } inline bool popcnt() const noexcept { return leaf1().all_bits_set(); } inline bool fma3() const noexcept { return sse_enabled() && leaf1().all_bits_set(); } inline bool avx() const noexcept { return avx_enabled() && leaf1().all_bits_set(); } inline bool bmi1() const noexcept { return leaf7().all_bits_set(); } inline bool avx2() const noexcept { return avx_enabled() && leaf7().all_bits_set(); } inline bool bmi2() const noexcept { return leaf7().all_bits_set(); } inline bool avx512f() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512dq() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512ifma() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512pf() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512er() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512cd() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512bw() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512vbmi() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512vbmi2() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avx512vnni_bw() const noexcept { return avx512_enabled() && leaf7().all_bits_set(); } inline bool avxvnni() const noexcept { return avx_enabled() && 
leaf7sub1().all_bits_set(); } inline bool fma4() const noexcept { return avx_enabled() && leaf80000001().all_bits_set(); } }; /******************** * Implementation * ********************/ template inline auto const& x86_cpu_features_backend_cpuid::safe_read_leaf(L& leaf_cache) const { // Check if already initialized if (m_status.bit_is_set()) { return leaf_cache; } // Limit where we need to check leaf0 or leaf 80000000. constexpr auto extended_threshold = x86_cpuid_leaf80000000::leaf; // Check if it is safe to call CPUID with this value. // First we identify if the leaf is in the regular or extended range. // TODO(C++17): if constexpr if (L::leaf < extended_threshold) { // Check leaf0 in regular range if (L::leaf <= leaf0().highest_leaf()) { leaf_cache = L::read(); } } else { // Check leaf80000000 in extended range if (L::leaf <= leaf80000000().highest_leaf()) { leaf_cache = L::read(); } } // Mark as valid in all cases, including if it was not read. // In this case it will be filled with zeros (all false). m_status.set_bit(); return leaf_cache; } inline x86_xcr0 const& x86_cpu_features_backend_cpuid::xcr0() const noexcept { if (!m_status.bit_is_set()) { m_xcr0 = osxsave() ? 
x86_xcr0::read() : x86_xcr0::safe_default(); m_status.set_bit(); } return m_xcr0; } inline x86_cpuid_leaf0 const& x86_cpu_features_backend_cpuid::leaf0() const { if (!m_status.bit_is_set()) { m_leaf0 = x86_cpuid_leaf0::read(); m_status.set_bit(); } return m_leaf0; } inline x86_cpuid_leaf80000000 const& x86_cpu_features_backend_cpuid::leaf80000000() const { if (!m_status.bit_is_set()) { m_leaf80000000 = x86_cpuid_leaf80000000::read(); m_status.set_bit(); } return m_leaf80000000; } inline x86_cpuid_leaf1 const& x86_cpu_features_backend_cpuid::leaf1() const { return safe_read_leaf(m_leaf1); } inline x86_cpuid_leaf7 const& x86_cpu_features_backend_cpuid::leaf7() const { return safe_read_leaf(m_leaf7); } inline x86_cpuid_leaf7sub1 const& x86_cpu_features_backend_cpuid::leaf7sub1() const { // Check if already initialized if (m_status.bit_is_set()) { return m_leaf7sub1; } // Check if safe to call CPUID with this value as subleaf. constexpr auto start = x86_cpuid_leaf7::eax::highest_subleaf_start; constexpr auto end = x86_cpuid_leaf7::eax::highest_subleaf_end; const auto highest_subleaf7 = leaf7().get_range(); if (x86_cpuid_leaf7sub1::subleaf <= highest_subleaf7) { m_leaf7sub1 = x86_cpuid_leaf7sub1::read(); } // Mark as valid in all cases, including if it was not read. // In this case it will be filled with zeros (all false). m_status.set_bit(); return m_leaf7sub1; } inline x86_cpuid_leaf80000001 const& x86_cpu_features_backend_cpuid::leaf80000001() const { return safe_read_leaf(m_leaf80000001); } inline bool x86_cpu_features_backend_cpuid::osxsave() const noexcept { return leaf1().all_bits_set(); } namespace detail { #if XSIMD_TARGET_X86 inline cpuid_reg_t x86_cpuid(int leaf, int subleaf) noexcept { cpuid_reg_t reg = {}; #if defined(_MSC_VER) int buf[4]; __cpuidex(buf, leaf, subleaf); std::memcpy(reg.data(), buf, sizeof(buf)); // Intel compiler has long had support for `__cpuid`, but only recently for `__cpuidex`. // Modern Clang and GCC also now support `__cpuidex`. 
// It was decided to keep the inline ASM version for maximum compatibility, as the difference // in ASM is negligible compared to the cost of CPUID. // https://github.com/xtensor-stack/xsimd/pull/1278 #elif XSIMD_WITH_INLINE_ASM #if defined(__i386__) && defined(__PIC__) // %ebx may be the PIC register __asm__("xchg{l}\t{%%}ebx, %1\n\t" "cpuid\n\t" "xchg{l}\t{%%}ebx, %1\n\t" : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) : "0"(leaf), "2"(subleaf)); #else __asm__("cpuid\n\t" : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) : "0"(leaf), "2"(subleaf)); #endif #endif return reg; } inline x86_reg32_t x86_xcr0_low() noexcept { #if defined(_MSC_VER) #if _MSC_VER >= 1400 return static_cast(_xgetbv(0)); #else #error "_MSC_VER < 1400 is not supported" #endif #elif XSIMD_WITH_INLINE_ASM x86_reg32_t xcr0 = {}; __asm__( "xorl %%ecx, %%ecx\n" "xgetbv\n" : "=a"(xcr0) : #if defined(__i386__) : "ecx", "edx" #else : "rcx", "rdx" #endif ); return xcr0; #endif } #else // XSIMD_TARGET_X86 inline cpuid_reg_t x86_cpuid(int /* leaf */, int /* subleaf */) noexcept { return {}; // All bits to zero } inline x86_reg32_t x86_xcr0_low() noexcept { return {}; // All bits to zero } #endif // XSIMD_TARGET_X86 } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_cpuid.hpp000066400000000000000000000124721517435117100247720ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_CPUID_HPP #define XSIMD_CPUID_HPP #include "../types/xsimd_all_registers.hpp" #include "./xsimd_cpu_features.hpp" #include "./xsimd_macros.hpp" namespace xsimd { namespace detail { struct supported_arch { #define ARCH_FIELD_EX(arch, field_name) \ unsigned field_name = 0; \ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } #define ARCH_FIELD_EX_REUSE(arch, field_name) \ XSIMD_INLINE bool has(::xsimd::arch) const { return this->field_name; } #define ARCH_FIELD(name) ARCH_FIELD_EX(name, name) ARCH_FIELD(sse2) ARCH_FIELD(sse3) ARCH_FIELD(ssse3) ARCH_FIELD(sse4_1) ARCH_FIELD(sse4_2) // ARCH_FIELD(sse4a) ARCH_FIELD_EX(fma3<::xsimd::sse4_2>, fma3_sse42) ARCH_FIELD(fma4) // ARCH_FIELD(xop) ARCH_FIELD(avx) ARCH_FIELD_EX(fma3<::xsimd::avx>, fma3_avx) ARCH_FIELD(avx2) ARCH_FIELD(avxvnni) ARCH_FIELD_EX(fma3<::xsimd::avx2>, fma3_avx2) ARCH_FIELD(avx512f) ARCH_FIELD(avx512cd) ARCH_FIELD(avx512dq) ARCH_FIELD(avx512bw) ARCH_FIELD(avx512er) ARCH_FIELD(avx512pf) ARCH_FIELD(avx512ifma) ARCH_FIELD(avx512vbmi) ARCH_FIELD(avx512vbmi2) ARCH_FIELD_EX(avx512vnni<::xsimd::avx512bw>, avx512vnni_bw) ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi2>, avx512vnni_vbmi2) ARCH_FIELD(neon) ARCH_FIELD(neon64) ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64) ARCH_FIELD_EX(detail::sve<512>, sve512) ARCH_FIELD_EX(detail::sve<256>, sve256) ARCH_FIELD_EX(detail::sve<128>, sve128) ARCH_FIELD_EX(detail::rvv<512>, rvv512) ARCH_FIELD_EX(detail::rvv<256>, rvv256) ARCH_FIELD_EX(detail::rvv<128>, rvv128) ARCH_FIELD(wasm) ARCH_FIELD(vsx) ARCH_FIELD(vxe) #undef ARCH_FIELD XSIMD_INLINE supported_arch() noexcept { #if XSIMD_WITH_WASM wasm = 1; #endif const auto cpu = xsimd::cpu_features(); vxe = cpu.vxe(); vsx = cpu.vsx(); rvv128 = cpu.rvv() && (cpu.rvv_size_bytes() >= (128 / 8)); rvv256 = cpu.rvv() && (cpu.rvv_size_bytes() >= (256 / 8)); rvv512 = cpu.rvv() && (cpu.rvv_size_bytes() >= (512 / 8)); neon = 
cpu.neon(); neon64 = cpu.neon64(); i8mm_neon64 = cpu.neon64() && cpu.i8mm(); // Running SVE128 on a SVE256 machine is more tricky than the x86 equivalent // of running SSE code on an AVX machine and requires to explicitly change the // vector length using `prctl` (per thread setting). // This is something we have not tested and not integrated in xsimd so the safe // default is to assume only one valid SVE width at runtime. sve128 = cpu.sve() && (cpu.sve_size_bytes() * 8 == 128); sve256 = cpu.sve() && (cpu.sve_size_bytes() * 8 == 256); sve512 = cpu.sve() && (cpu.sve_size_bytes() * 8 == 512); sse2 = cpu.sse2(); sse3 = cpu.sse3(); ssse3 = cpu.ssse3(); sse4_1 = cpu.sse4_1(); sse4_2 = cpu.sse4_2(); fma3_sse42 = cpu.fma3(); // sse4a not implemented in cpu_id yet // xop not implemented in cpu_id yet avx = cpu.avx(); fma3_avx = avx && fma3_sse42; fma4 = cpu.fma4(); avx2 = cpu.avx2(); avxvnni = cpu.avxvnni(); fma3_avx2 = avx2 && fma3_sse42; avx512f = cpu.avx512f(); avx512cd = cpu.avx512cd(); avx512dq = cpu.avx512dq(); avx512bw = cpu.avx512bw(); avx512er = cpu.avx512er(); avx512pf = cpu.avx512pf(); avx512ifma = cpu.avx512ifma(); avx512vbmi = cpu.avx512vbmi(); avx512vbmi2 = cpu.avx512vbmi2(); avx512vnni_bw = cpu.avx512vnni_bw(); avx512vnni_vbmi2 = avx512vbmi2 && avx512vnni_bw; } }; } // namespace detail XSIMD_INLINE detail::supported_arch available_architectures() noexcept { static detail::supported_arch supported; return supported; } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_getauxval.hpp000066400000000000000000000110151517435117100256560ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ***************************************************************************/ #ifndef XSIMD_GETAUXVAL_HPP #define XSIMD_GETAUXVAL_HPP #include "../utils/bits.hpp" #include "./xsimd_config.hpp" #if XSIMD_HAVE_LINUX_GETAUXVAL #include // getauxval #endif namespace xsimd { namespace detail { using linux_getauxval_t = unsigned long; inline linux_getauxval_t linux_getauxval(linux_getauxval_t type) noexcept; } /* * Holds the value of a Linux auxiliary vector entry (e.g. AT_HWCAP). * * On Linux systems, the kernel exposes some CPU features through the * auxiliary vector, which can be queried via `getauxval(AT_HWCAP)`. * Well defined on all platforms, and will always return false on * non-Linux platforms. * * Usage: * auto hwcap = linux_auxval::read(AT_HWCAP); * bool neon = hwcap.has_feature(HWCAP_NEON); * * @see https://www.kernel.org/doc/Documentation/arm64/elf_hwcaps.txt */ class linux_auxval { private: using getauxval_t = detail::linux_getauxval_t; public: constexpr linux_auxval() noexcept = default; inline static linux_auxval read(getauxval_t type) noexcept { return linux_auxval(detail::linux_getauxval(type)); } constexpr bool has_feature(getauxval_t feat) const noexcept { return (m_auxval & feat) == feat; } private: getauxval_t m_auxval = {}; constexpr explicit linux_auxval(getauxval_t v) noexcept : m_auxval(v) { } }; class linux_hwcap_backend { public: inline linux_auxval hwcap() const noexcept; inline linux_auxval hwcap2() const noexcept; private: enum class status { hwcap_valid = 0, hwcap2_valid = 1, }; using status_bitset = utils::uint_bitset; mutable status_bitset m_status {}; mutable xsimd::linux_auxval m_hwcap {}; mutable xsimd::linux_auxval m_hwcap2 {}; }; class linux_hwcap_backend_noop { public: inline linux_auxval hwcap() const noexcept { return {}; } inline linux_auxval hwcap2() const noexcept { return {}; } }; #if XSIMD_HAVE_LINUX_GETAUXVAL using linux_hwcap_backend_default = linux_hwcap_backend; #else // Contrary to CPUID that is only used on one 
architecture, HWCAP are // available on multiple architectures with different meaning for the // different bit fields. // We use the Linux `HWCAP` constants directly to avoid repetition, so // we could not use a default implementation without already being on // Linux anyways. struct linux_hwcap_backend_default { }; #endif /******************** * Implementation * ********************/ namespace detail { #if XSIMD_HAVE_LINUX_GETAUXVAL inline linux_getauxval_t linux_getauxval(linux_getauxval_t type) noexcept { return getauxval(type); } #else inline linux_getauxval_t linux_getauxval(linux_getauxval_t) noexcept { return {}; // All bits set to 0 } #endif } inline linux_auxval linux_hwcap_backend::hwcap() const noexcept { if (!m_status.bit_is_set()) { #if XSIMD_HAVE_LINUX_GETAUXVAL m_hwcap = linux_auxval::read(AT_HWCAP); #endif m_status.set_bit(); } return m_hwcap; } inline linux_auxval linux_hwcap_backend::hwcap2() const noexcept { if (!m_status.bit_is_set()) { #if XSIMD_HAVE_LINUX_GETAUXVAL m_hwcap2 = linux_auxval::read(AT_HWCAP2); #endif m_status.set_bit(); } return m_hwcap2; } } #endif xtensor-stack-xsimd-541558d/include/xsimd/config/xsimd_macros.hpp000066400000000000000000000052001517435117100251410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_MACROS_HPP #define XSIMD_MACROS_HPP #if defined(__VEC__) #define XSIMD_INLINE inline #elif defined __has_attribute #if __has_attribute(always_inline) #define XSIMD_INLINE inline __attribute__((always_inline)) #else #define XSIMD_INLINE inline #endif #elif defined(_MSC_VER) #define XSIMD_INLINE inline __forceinline #else #define XSIMD_INLINE inline #endif #define XSIMD_CONCAT_INNER(a, b) a##b #define XSIMD_CONCAT(a, b) XSIMD_CONCAT_INNER(a, b) #if defined(__FAST_MATH__) #define XSIMD_NO_DENORMALS #define XSIMD_NO_INFINITIES #define XSIMD_NO_NANS #endif #if defined(__has_cpp_attribute) // if this check passes, then the compiler supports feature test macros #if __has_cpp_attribute(nodiscard) >= 201603L // if this check passes, then the compiler supports [[nodiscard]] without a message #define XSIMD_NO_DISCARD [[nodiscard]] #endif #endif #if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L // this means that the previous tests failed, but we are using C++17 or higher #define XSIMD_NO_DISCARD [[nodiscard]] #endif #if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__)) // this means that the previous checks failed, but we are using GCC or Clang #define XSIMD_NO_DISCARD __attribute__((warn_unused_result)) #endif #if !defined(XSIMD_NO_DISCARD) // this means that all the previous checks failed, so we fallback to doing nothing #define XSIMD_NO_DISCARD #endif #ifdef __cpp_if_constexpr // this means that the compiler supports the `if constexpr` construct #define XSIMD_IF_CONSTEXPR if constexpr #endif #if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L // this means that the previous test failed, but we are using C++17 or higher #define XSIMD_IF_CONSTEXPR if constexpr #endif #if !defined(XSIMD_IF_CONSTEXPR) // this means that all the previous checks failed, so we fallback to a normal `if` #define XSIMD_IF_CONSTEXPR if #endif #endif 
xtensor-stack-xsimd-541558d/include/xsimd/math/000077500000000000000000000000001517435117100214275ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/math/xsimd_rem_pio2.hpp000066400000000000000000000611451517435117100250670ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include #include #include namespace xsimd { namespace detail { /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ #if defined(_MSC_VER) #define ONCE0 \ __pragma(warning(push)) \ __pragma(warning(disable : 4127)) while (0) \ __pragma(warning(pop)) /**/ #else #define ONCE0 while (0) #endif /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. 
* ==================================================== */ #ifdef XSIMD_LITTLE_ENDIAN #define LOW_WORD_IDX 0 #define HIGH_WORD_IDX sizeof(std::uint32_t) #else #define LOW_WORD_IDX sizeof(std::uint32_t) #define HIGH_WORD_IDX 0 #endif #define GET_HIGH_WORD(i, d) \ do \ { \ double f = (d); \ std::memcpy(&(i), reinterpret_cast(&f) + HIGH_WORD_IDX, \ sizeof(std::uint32_t)); \ } \ ONCE0 \ /**/ #define GET_LOW_WORD(i, d) \ do \ { \ double f = (d); \ std::memcpy(&(i), reinterpret_cast(&f) + LOW_WORD_IDX, \ sizeof(std::uint32_t)); \ } \ ONCE0 \ /**/ #define SET_HIGH_WORD(d, v) \ do \ { \ double f = (d); \ std::uint32_t value = (v); \ std::memcpy(reinterpret_cast(&f) + HIGH_WORD_IDX, \ &value, sizeof(std::uint32_t)); \ (d) = f; \ } \ ONCE0 \ /**/ #define SET_LOW_WORD(d, v) \ do \ { \ double f = (d); \ std::uint32_t value = (v); \ std::memcpy(reinterpret_cast(&f) + LOW_WORD_IDX, \ &value, sizeof(std::uint32_t)); \ (d) = f; \ } \ ONCE0 \ /**/ /* * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2) * double x[],y[]; int e0,nx,prec; int ipio2[]; * * __kernel_rem_pio2 return the last three digits of N with * y = x - N*pi/2 * so that |y| < pi/2. * * The method is to compute the integer (mod 8) and fraction parts of * (2/pi)*x without doing the full multiplication. In general we * skip the part of the product that are known to be a huge integer ( * more accurately, = 0 mod 8 ). Thus the number of operations are * independent of the exponent of the input. * * (2/pi) is represented by an array of 24-bit integers in ipio2[]. * * Input parameters: * x[] The input value (must be positive) is broken into nx * pieces of 24-bit integers in double precision format. * x[i] will be the i-th 24 bit of x. The scaled exponent * of x[0] is given in input parameter e0 (i.e., x[0]*2^e0 * match x's up to 24 bits. 
* * Example of breaking a double positive z into x[0]+x[1]+x[2]: * e0 = ilogb(z)-23 * z = scalbn(z,-e0) * for i = 0,1,2 * x[i] = floor(z) * z = (z-x[i])*2**24 * * * y[] output result in an array of double precision numbers. * The dimension of y[] is: * 24-bit precision 1 * 53-bit precision 2 * 64-bit precision 2 * 113-bit precision 3 * The actual value is the sum of them. Thus for 113-bit * precision, one may have to do something like: * * long double t,w,r_head, r_tail; * t = (long double)y[2] + (long double)y[1]; * w = (long double)y[0]; * r_head = t+w; * r_tail = w - (r_head - t); * * e0 The exponent of x[0] * * nx dimension of x[] * * prec an integer indicating the precision: * 0 24 bits (single) * 1 53 bits (double) * 2 64 bits (extended) * 3 113 bits (quad) * * ipio2[] * integer array, contains the (24*i)-th to (24*i+23)-th * bit of 2/pi after binary point. The corresponding * floating value is * * ipio2[i] * 2^(-24(i+1)). * * External function: * double scalbn(), floor(); * * * Here is the description of some local variables: * * jk jk+1 is the initial number of terms of ipio2[] needed * in the computation. The recommended value is 2,3,4, * 6 for single, double, extended, and quad. * * jz local integer variable indicating the number of * terms of ipio2[] used. * * jx nx - 1 * * jv index for pointing to the suitable ipio2[] for the * computation. In general, we want * ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8 * is an integer. Thus * e0-3-24*jv >= 0 or (e0-3)/24 >= jv * Hence jv = max(0,(e0-3)/24). * * jp jp+1 is the number of terms in PIo2[] needed, jp = jk. * * q[] double array with integral value, representing the * 24-bits chunk of the product of x and 2/pi. * * q0 the corresponding exponent of q[0]. Note that the * exponent for q[i] would be q0-24*i. * * PIo2[] double precision array, obtained by cutting pi/2 * into 24 bits chunks. * * f[] ipio2[] in floating point * * iq[] integer array by breaking up q[] in 24-bits chunk. 
* * fq[] final product of x*(2/pi) in fq[0],..,fq[jk] * * ih integer. If >0 it indicates q[] is >= 0.5, hence * it also indicates the *sign* of the result. * */ XSIMD_INLINE int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept { static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */ static const double PIo2[] = { 1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */ 7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */ 5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */ 3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */ 1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */ 1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */ 2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */ 2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */ }; static const double zero = 0.0, one = 1.0, two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */ int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih; double z, fw, f[20], fq[20], q[20]; /* initialize jk*/ jk = init_jk[prec]; jp = jk; /* determine jx,jv,q0, note that 3>q0 */ jx = nx - 1; jv = (e0 - 3) / 24; if (jv < 0) jv = 0; q0 = e0 - 24 * (jv + 1); /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */ j = jv - jx; m = jx + jk; for (i = 0; i <= m; i++, j++) f[i] = (j < 0) ? 
zero : (double)ipio2[j]; /* compute q[0],q[1],...q[jk] */ for (i = 0; i <= jk; i++) { for (j = 0, fw = 0.0; j <= jx; j++) fw += x[j] * f[jx + i - j]; q[i] = fw; } jz = jk; recompute: /* distill q[] into iq[] reversingly */ for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--) { fw = (double)((int32_t)(twon24 * z)); iq[i] = (int)(z - two24 * fw); z = q[j - 1] + fw; } /* compute n */ z = std::scalbn(z, q0); /* actual value of z */ z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */ n = (int32_t)z; z -= (double)n; ih = 0; if (q0 > 0) { /* need iq[jz-1] to determine n */ i = (iq[jz - 1] >> (24 - q0)); n += i; iq[jz - 1] -= i << (24 - q0); ih = iq[jz - 1] >> (23 - q0); } else if (q0 == 0) ih = iq[jz - 1] >> 23; else if (z >= 0.5) ih = 2; if (ih > 0) { /* q > 0.5 */ n += 1; carry = 0; for (i = 0; i < jz; i++) { /* compute 1-q */ j = iq[i]; if (carry == 0) { if (j != 0) { carry = 1; iq[i] = 0x1000000 - j; } } else iq[i] = 0xffffff - j; } if (q0 > 0) { /* rare case: chance is 1 in 12 */ switch (q0) { case 1: iq[jz - 1] &= 0x7fffff; break; case 2: iq[jz - 1] &= 0x3fffff; break; } } if (ih == 2) { z = one - z; if (carry != 0) z -= std::scalbn(one, q0); } } /* check if recomputation is needed */ if (z == zero) { j = 0; for (i = jz - 1; i >= jk; i--) j |= iq[i]; if (j == 0) { /* need recomputation */ for (k = 1; iq[jk - k] == 0; k++) ; /* k = no. 
of terms needed */ for (i = jz + 1; i <= jz + k; i++) { /* add q[jz+1] to q[jz+k] */ f[jx + i] = (double)ipio2[jv + i]; for (j = 0, fw = 0.0; j <= jx; j++) fw += x[j] * f[jx + i - j]; q[i] = fw; } jz += k; goto recompute; } } /* chop off zero terms */ if (z == 0.0) { jz -= 1; q0 -= 24; while (iq[jz] == 0) { jz--; q0 -= 24; } } else { /* break z into 24-bit if necessary */ z = std::scalbn(z, -q0); if (z >= two24) { fw = (double)((int32_t)(twon24 * z)); iq[jz] = (int32_t)(z - two24 * fw); jz += 1; q0 += 24; iq[jz] = (int32_t)fw; } else iq[jz] = (int32_t)z; } /* convert integer "bit" chunk to floating-point value */ fw = scalbn(one, q0); for (i = jz; i >= 0; i--) { q[i] = fw * (double)iq[i]; fw *= twon24; } /* compute PIo2[0,...,jp]*q[jz,...,0] */ for (i = jz; i >= 0; i--) { for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++) fw += PIo2[k] * q[i + k]; fq[jz - i] = fw; } /* compress fq[] into y[] */ switch (prec) { case 0: fw = 0.0; for (i = jz; i >= 0; i--) fw += fq[i]; y[0] = (ih == 0) ? fw : -fw; break; case 1: case 2: fw = 0.0; for (i = jz; i >= 0; i--) fw += fq[i]; y[0] = (ih == 0) ? fw : -fw; fw = fq[0] - fw; for (i = 1; i <= jz; i++) fw += fq[i]; y[1] = (ih == 0) ? 
fw : -fw; break; case 3: /* painful */ for (i = jz; i > 0; i--) { fw = fq[i - 1] + fq[i]; fq[i] += fq[i - 1] - fw; fq[i - 1] = fw; } for (i = jz; i > 1; i--) { fw = fq[i - 1] + fq[i]; fq[i] += fq[i - 1] - fw; fq[i - 1] = fw; } for (fw = 0.0, i = jz; i >= 2; i--) fw += fq[i]; if (ih == 0) { y[0] = fq[0]; y[1] = fq[1]; y[2] = fw; } else { y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw; } } return n & 7; } XSIMD_INLINE std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept { static const std::int32_t two_over_pi[] = { 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A, 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129, 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8, 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF, 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08, 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3, 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B, }; static const std::int32_t npio2_hw[] = { 0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB, 0x401F6A7A, 0x4022D97C, 0x4025FDBB, 0x402921FB, 0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C, 0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB, 0x403AB41B, 0x403C463A, 0x403DD85A, 0x403F6A7A, 0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C, 0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB, 0x4046C6CB, 0x40478FDB, 0x404858EB, 0x404921FB, }; /* * invpio2: 53 bits of 2/pi * pio2_1: first 33 bit of pi/2 * pio2_1t: pi/2 - pio2_1 * pio2_2: second 33 bit of pi/2 * pio2_2t: pi/2 - (pio2_1+pio2_2) * pio2_3: third 33 bit of pi/2 * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3) */ static const double zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */ half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */ two24 = 
1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */ invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */ pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */ pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */ pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */ pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */ pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */ pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */ double z = 0., w, t, r, fn; double tx[3]; std::int32_t e0, i, j, nx, n, ix, hx; std::uint32_t low; GET_HIGH_WORD(hx, x); /* high word of x */ ix = hx & 0x7fffffff; if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */ { y[0] = x; y[1] = 0; return 0; } if (ix < 0x4002d97c) { /* |x| < 3pi/4, special case with n=+-1 */ if (hx > 0) { z = x - pio2_1; if (ix != 0x3ff921fb) { /* 33+53 bit pi is good enough */ y[0] = z - pio2_1t; y[1] = (z - y[0]) - pio2_1t; } else { /* near pi/2, use 33+33+53 bit pi */ z -= pio2_2; y[0] = z - pio2_2t; y[1] = (z - y[0]) - pio2_2t; } return 1; } else { /* negative x */ z = x + pio2_1; if (ix != 0x3ff921fb) { /* 33+53 bit pi is good enough */ y[0] = z + pio2_1t; y[1] = (z - y[0]) + pio2_1t; } else { /* near pi/2, use 33+33+53 bit pi */ z += pio2_2; y[0] = z + pio2_2t; y[1] = (z - y[0]) + pio2_2t; } return -1; } } if (ix <= 0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium_ size */ t = std::fabs(x); n = (std::int32_t)(t * invpio2 + half); fn = (double)n; r = t - fn * pio2_1; w = fn * pio2_1t; /* 1st round good to 85 bit */ if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1])) { y[0] = r - w; /* quick check no cancellation */ } else { std::uint32_t high; j = ix >> 20; y[0] = r - w; GET_HIGH_WORD(high, y[0]); i = j - static_cast((high >> 20) & 0x7ff); if (i > 16) { /* 2nd iteration needed, good to 118 */ t = r; w = fn * pio2_2; r = t - w; w = fn * pio2_2t - ((t - r) - w); y[0] = r - w; GET_HIGH_WORD(high, y[0]); i = j - 
static_cast((high >> 20) & 0x7ff); if (i > 49) { /* 3rd iteration need, 151 bits acc */ t = r; /* will cover all possible cases */ w = fn * pio2_3; r = t - w; w = fn * pio2_3t - ((t - r) - w); y[0] = r - w; } } } y[1] = (r - y[0]) - w; if (hx < 0) { y[0] = -y[0]; y[1] = -y[1]; return -n; } else return n; } /* * all other (large) arguments */ if (ix >= 0x7ff00000) { /* x is inf or NaN */ y[0] = y[1] = x - x; return 0; } /* set z = scalbn(|x|,ilogb(x)-23) */ GET_LOW_WORD(low, x); SET_LOW_WORD(z, low); e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */ SET_HIGH_WORD(z, static_cast(ix - (e0 << 20))); for (i = 0; i < 2; i++) { tx[i] = (double)((std::int32_t)(z)); z = (z - tx[i]) * two24; } tx[2] = z; nx = 3; while (tx[nx - 1] == zero) nx--; /* skip zero term */ n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi); if (hx < 0) { y[0] = -y[0]; y[1] = -y[1]; return -n; } return n; } } #undef SET_LOW_WORD #undef SET_HIGH_WORD #undef GET_LOW_WORD #undef GET_HIGH_WORD #undef HIGH_WORD_IDX #undef LOW_WORD_IDX #undef ONCE0 } xtensor-stack-xsimd-541558d/include/xsimd/memory/000077500000000000000000000000001517435117100220065ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/memory/xsimd_aligned_allocator.hpp000066400000000000000000000262541517435117100273770ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ALIGNED_ALLOCATOR_HPP #define XSIMD_ALIGNED_ALLOCATOR_HPP #include #include #include #ifdef _WIN32 #include #else #include #endif #include #include #include "../config/xsimd_arch.hpp" namespace xsimd { /** * @class aligned_allocator * @brief Allocator for aligned memory * * The aligned_allocator class template is an allocator that * performs memory allocation aligned by the specified value. * * @tparam T type of objects to allocate. * @tparam Align alignment in bytes. */ template ::value, common, default_arch>::type::alignment()> class aligned_allocator { public: using value_type = T; using pointer = T*; using const_pointer = const T*; using reference = T&; using const_reference = const T&; using size_type = size_t; using difference_type = ptrdiff_t; static constexpr size_t alignment = Align; template struct rebind { using other = aligned_allocator; }; XSIMD_INLINE aligned_allocator() noexcept; XSIMD_INLINE aligned_allocator(const aligned_allocator& rhs) noexcept; template XSIMD_INLINE aligned_allocator(const aligned_allocator& rhs) noexcept; XSIMD_INLINE ~aligned_allocator(); XSIMD_INLINE pointer address(reference) noexcept; XSIMD_INLINE const_pointer address(const_reference) const noexcept; XSIMD_INLINE pointer allocate(size_type n, const void* hint = 0); XSIMD_INLINE void deallocate(pointer p, size_type n); XSIMD_INLINE size_type max_size() const noexcept; XSIMD_INLINE size_type size_max() const noexcept; template XSIMD_INLINE void construct(U* p, Args&&... 
args); template XSIMD_INLINE void destroy(U* p); }; template XSIMD_INLINE bool operator==(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept; template XSIMD_INLINE bool operator!=(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept; XSIMD_INLINE void* aligned_malloc(size_t size, size_t alignment); XSIMD_INLINE void aligned_free(void* ptr); template XSIMD_INLINE size_t get_alignment_offset(const T* p, size_t size, size_t block_size); /************************************ * aligned_allocator implementation * ************************************/ /** * Default constructor. The allocator holds no per-instance data, so there is * nothing to initialize. */ template XSIMD_INLINE aligned_allocator::aligned_allocator() noexcept { } /** * Copy constructor. Nothing is copied since the allocator holds no per-instance data. */ template XSIMD_INLINE aligned_allocator::aligned_allocator(const aligned_allocator&) noexcept { } /** * Extended copy constructor, accepting an aligned_allocator instantiated for another type. */ template template XSIMD_INLINE aligned_allocator::aligned_allocator(const aligned_allocator&) noexcept { } /** * Destructor. */ template XSIMD_INLINE aligned_allocator::~aligned_allocator() { } /** * Returns the actual address of \c r even in presence of overloaded \c operator&. * The address is obtained through \c std::addressof so that a user-provided * \c operator& is never invoked. * @param r the object to acquire address of. * @return the actual address of \c r. */ template XSIMD_INLINE auto aligned_allocator::address(reference r) noexcept -> pointer { return std::addressof(r); } /** * Const overload: returns the actual address of \c r even in presence of * overloaded \c operator&, through \c std::addressof. * @param r the object to acquire address of. * @return the actual address of \c r. */ template XSIMD_INLINE auto aligned_allocator::address(const_reference r) const noexcept -> const_pointer { return std::addressof(r); } /** * Allocates n * sizeof(T) bytes of uninitialized memory, aligned by \c A. * The alignment may require some extra memory allocation. * @param n the number of objects to allocate storage for. * @param hint unused parameter provided for standard compliance.
* @return a pointer to the first byte of a memory block suitably aligned and sufficient to * hold an array of \c n objects of type \c T. */ template XSIMD_INLINE auto aligned_allocator::allocate(size_type n, const void*) -> pointer { pointer res = reinterpret_cast(aligned_malloc(sizeof(T) * n, A)); #if defined(_CPPUNWIND) || defined(__cpp_exceptions) if (res == nullptr) throw std::bad_alloc(); #endif return res; } /** * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by * an earlier call to allocate(). The argument \c n must be equal to the first argument of the call * to allocate() that originally produced \c p; otherwise, the behavior is undefined. * @param p pointer obtained from allocate(). * @param n number of objects earlier passed to allocate(). */ template XSIMD_INLINE void aligned_allocator::deallocate(pointer p, size_type) { aligned_free(p); } /** * Returns the maximum theoretically possible value of \c n, for which the * call allocate(n, 0) could succeed. * @return the maximum supported allocated size. */ template XSIMD_INLINE auto aligned_allocator::max_size() const noexcept -> size_type { return size_type(-1) / sizeof(T); } /** * This method is deprecated, use max_size() instead */ template XSIMD_INLINE auto aligned_allocator::size_max() const noexcept -> size_type { return size_type(-1) / sizeof(T); } /** * Constructs an object of type \c T in allocated uninitialized memory * pointed to by \c p, using placement-new. * @param p pointer to allocated uninitialized memory. * @param args the constructor arguments to use. */ template template XSIMD_INLINE void aligned_allocator::construct(U* p, Args&&... args) { new ((void*)p) U(std::forward(args)...); } /** * Calls the destructor of the object pointed to by \c p. * @param p pointer to the object that is going to be destroyed. 
*/ template template XSIMD_INLINE void aligned_allocator::destroy(U* p) { p->~U(); } /** * @defgroup allocator_comparison Comparison operators */ /** * @ingroup allocator_comparison * Compares two aligned memory allocators for equality. Since allocators * are stateless, returns \c true iff A1 == A2. * @param lhs aligned_allocator to compare. * @param rhs aligned_allocator to compare. * @return true if the allocators have the same alignment. */ template XSIMD_INLINE bool operator==(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept { return lhs.alignment == rhs.alignment; } /** * @ingroup allocator_comparison * Compares two aligned memory allocators for inequality. Since allocators * are stateless, returns \c true iff A1 != A2. * @param lhs aligned_allocator to compare. * @param rhs aligned_allocator to compare. * @return true if the allocators have different alignments. */ template XSIMD_INLINE bool operator!=(const aligned_allocator& lhs, const aligned_allocator& rhs) noexcept { return !(lhs == rhs); } /**************************************** * aligned malloc / free implementation * ****************************************/ namespace detail { /** * Allocates \c size bytes aligned on \c alignment, using \c _aligned_malloc when * \c _WIN32 is defined and \c posix_memalign otherwise. The assertions require * \c alignment to be a power of two that is at least \c sizeof(void*). * @return the allocated block; \c nullptr when \c posix_memalign reports an error. */ XSIMD_INLINE void* xaligned_malloc(size_t size, size_t alignment) { assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two"); assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer"); void* res = nullptr; #ifdef _WIN32 res = _aligned_malloc(size, alignment); #else if (posix_memalign(&res, alignment, size) != 0) { res = nullptr; } #endif return res; } /** * Releases a block with \c _aligned_free when \c _WIN32 is defined and with * \c free otherwise. */ XSIMD_INLINE void xaligned_free(void* ptr) { #ifdef _WIN32 _aligned_free(ptr); #else free(ptr); #endif } } /** * Forwards to detail::xaligned_malloc. */ XSIMD_INLINE void* aligned_malloc(size_t size, size_t alignment) { return detail::xaligned_malloc(size, alignment); } /** * Forwards to detail::xaligned_free. */ XSIMD_INLINE void aligned_free(void* ptr) { detail::xaligned_free(ptr); } /** * Computes how many elements at the start of the array \c p must be skipped to reach * an element whose address, divided by \c sizeof(T), is a multiple of \c block_size. * @param p pointer to the first element of the array. * @param size number of elements in the array; also the upper bound of the result. * @param block_size number of elements per block. NOTE(review): the mask arithmetic * presumes \c block_size and \c sizeof(T) are powers of two - confirm. * @return 0 when \c block_size is 1, \c size when the address of \c p is not a multiple of * \c sizeof(T), otherwise the offset described above capped at \c size. */ template XSIMD_INLINE size_t get_alignment_offset(const T* p, size_t size, size_t block_size) { // size_t
block_size = simd_traits::size; if (block_size == 1) { // The simd_block consists of exactly one scalar so that all // elements of the array // are "well" aligned. return 0; } else if (size_t(p) & (sizeof(T) - 1)) { // The array is not aligned to the size of a single element, so that // no element // of the array is well aligned return size; } else { size_t block_mask = block_size - 1; return std::min( (block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask, size); } } /** * Default allocator alias, selected at compile time through \c std::conditional_t. */ template using default_allocator = std::conditional_t, std::allocator>; } #endif xtensor-stack-xsimd-541558d/include/xsimd/memory/xsimd_alignment.hpp000066400000000000000000000055501517435117100257060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_ALIGNMENT_HPP #define XSIMD_ALIGNMENT_HPP #include "../types/xsimd_utils.hpp" #include "xsimd_aligned_allocator.hpp" namespace xsimd { /** * @struct aligned_mode * @brief tag for load and store of aligned memory. */ struct aligned_mode { }; /** * @struct unaligned_mode * @brief tag for load and store of unaligned memory. */ struct unaligned_mode { }; /** * @struct stream_mode * @brief tag for load and store of aligned non-temporal memory. * * Streaming accesses expect aligned pointers. When no architecture-specific * implementation is available, they fall back to aligned semantics.
*/ struct stream_mode { }; /*********************** * Allocator alignment * ***********************/ /** * Trait mapping an allocator type to a memory-access tag. The primary template * reports \c unaligned_mode; the partial specialization that follows reports * \c aligned_mode. */ template struct allocator_alignment { using type = unaligned_mode; }; template struct allocator_alignment> { using type = aligned_mode; }; /** * Shorthand for the \c type member of \c allocator_alignment. */ template using allocator_alignment_t = typename allocator_alignment::type; /*********************** * container alignment * ***********************/ /** * Trait mapping a container type to a memory-access tag. The primary template * reports \c unaligned_mode; the partial specialization that follows defers to * \c allocator_alignment_t. */ template struct container_alignment { using type = unaligned_mode; }; template struct container_alignment> { using type = allocator_alignment_t; }; /** * Shorthand for the \c type member of \c container_alignment. */ template using container_alignment_t = typename container_alignment::type; /********************* * alignment checker * *********************/ /** * Checks whether pointer \c ptr is aligned according to the alignment * requirements of \c Arch, i.e. whether its address is a multiple of * \c Arch::alignment(). * @param ptr pointer to test. * @return true if the alignment requirements are met */ template XSIMD_INLINE bool is_aligned(void const* ptr) { return (reinterpret_cast(ptr) % static_cast(Arch::alignment())) == 0; } } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/000077500000000000000000000000001517435117100216425ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_all_registers.hpp000066400000000000000000000037351517435117100264260ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software.
* ****************************************************************************/ #include "xsimd_fma3_sse_register.hpp" #include "xsimd_fma4_register.hpp" #include "xsimd_sse2_register.hpp" #include "xsimd_sse3_register.hpp" #include "xsimd_sse4_1_register.hpp" #include "xsimd_sse4_2_register.hpp" #include "xsimd_avx2_register.hpp" #include "xsimd_avx_register.hpp" #include "xsimd_avxvnni_register.hpp" #include "xsimd_fma3_avx2_register.hpp" #include "xsimd_fma3_avx_register.hpp" #include "xsimd_avx512vnni_avx512bw_register.hpp" #include "xsimd_avx512vnni_avx512vbmi2_register.hpp" #include "xsimd_avx512ifma_register.hpp" #include "xsimd_avx512vbmi2_register.hpp" #include "xsimd_avx512vbmi_register.hpp" #include "xsimd_avx512er_register.hpp" #include "xsimd_avx512pf_register.hpp" #include "xsimd_avx512bw_register.hpp" #include "xsimd_avx512cd_register.hpp" #include "xsimd_avx512dq_register.hpp" #include "xsimd_avx512f_register.hpp" #include "xsimd_i8mm_neon64_register.hpp" #include "xsimd_neon64_register.hpp" #include "xsimd_neon_register.hpp" #include "xsimd_sve_register.hpp" #include "xsimd_rvv_register.hpp" #include "xsimd_wasm_register.hpp" #include "xsimd_vsx_register.hpp" #include "xsimd_vxe_register.hpp" #if XSIMD_WITH_EMULATED #include "xsimd_emulated_register.hpp" #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_api.hpp000066400000000000000000003216341517435117100243410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_API_HPP #define XSIMD_API_HPP #include #include #include #include #include "../arch/xsimd_isa.hpp" #include "../types/xsimd_batch.hpp" #include "../types/xsimd_traits.hpp" #include "../utils/xsimd_type_traits.hpp" namespace xsimd { /** * High level free functions * * @defgroup batch_arithmetic Arithmetic operators * @defgroup batch_constant Constant batches * @defgroup batch_cond Conditional operators * @defgroup batch_data_transfer Memory operators * @defgroup batch_math Basic math operators * @defgroup batch_math_extra Extra math operators * @defgroup batch_fp Floating point manipulation * @defgroup batch_rounding Rounding operators * @defgroup batch_conversion Conversion operators * @defgroup batch_complex Complex operators * @defgroup batch_logical Logical operators * @defgroup batch_bitwise Bitwise operators * @defgroup batch_reducers Reducers * @defgroup batch_miscellaneous Miscellaneous * @defgroup batch_trigo Trigonometry * * @defgroup batch_bool_logical Boolean logical operators * @defgroup batch_bool_reducers Boolean reducers */ /** * @ingroup batch_math * * Computes the absolute value of each scalar in the batch \c x. * @param x batch of integer or floating point values. * @return the absolute values of \c x. */ template XSIMD_INLINE batch abs(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::abs(x, A {}); } /** * @ingroup batch_complex * * Computes the absolute value (modulus) of each complex number in the batch \c z. * @param z batch of complex values. * @return the absolute values of \c z. */ template XSIMD_INLINE batch abs(batch, A> const& z) noexcept { detail::static_check_supported_config(); return kernel::abs(z, A {}); } /** * @ingroup batch_arithmetic * * Computes the sum of the batches \c x and \c y. * @param x batch or scalar involved in the addition. * @param y batch or scalar involved in the addition.
* @return the sum of \c x and \c y. */ template XSIMD_INLINE batch add(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x + y; } /** * @ingroup batch_trigo * * Computes the arc cosine of the batch \c x. * @param x batch of floating point values. * @return the arc cosine of \c x. */ template XSIMD_INLINE batch acos(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::acos(x, A {}); } /** * @ingroup batch_trigo * * Computes the inverse hyperbolic cosine of the batch \c x. * @param x batch of floating point values. * @return the inverse hyperbolic cosine of \c x. */ template XSIMD_INLINE batch acosh(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::acosh(x, A {}); } /** * @ingroup batch_complex * * Computes the argument of each element of the batch \c z. * @param z batch of complex or real values. * @return the argument of \c z, as the batch type given by \c real_batch_type_t. */ template XSIMD_INLINE real_batch_type_t> arg(batch const& z) noexcept { detail::static_check_supported_config(); return kernel::arg(z, A {}); } /** * @ingroup batch_trigo * * Computes the arc sine of the batch \c x. * @param x batch of floating point values. * @return the arc sine of \c x. */ template XSIMD_INLINE batch asin(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::asin(x, A {}); } /** * @ingroup batch_trigo * * Computes the inverse hyperbolic sine of the batch \c x. * @param x batch of floating point values. * @return the inverse hyperbolic sine of \c x. */ template XSIMD_INLINE batch asinh(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::asinh(x, A {}); } /** * @ingroup batch_trigo * * Computes the arc tangent of the batch \c x. * @param x batch of floating point values. * @return the arc tangent of \c x.
*/ template XSIMD_INLINE batch atan(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::atan(x, A {}); } /** * @ingroup batch_trigo * * Computes the arc tangent of the batch \c x/y, using the signs of the * arguments to determine the correct quadrant. * @param x batch of floating point values (numerator of the ratio). * @param y batch of floating point values (denominator of the ratio). * @return the arc tangent of \c x/y. */ template XSIMD_INLINE batch atan2(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::atan2(x, y, A {}); } /** * @ingroup batch_trigo * * Computes the inverse hyperbolic tangent of the batch \c x. * @param x batch of floating point values. * @return the inverse hyperbolic tangent of \c x. */ template XSIMD_INLINE batch atanh(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::atanh(x, A {}); } /** * @ingroup batch_math * * Computes the average of the batches \c x and \c y. * @param x batch of \c T * @param y batch of \c T * @return the average of the elements of \c x and \c y. */ template XSIMD_INLINE batch avg(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::avg(x, y, A {}); } /** * @ingroup batch_math * * Computes the rounded average of the batches \c x and \c y. * @param x batch of \c T * @param y batch of \c T * @return the rounded average of the elements of \c x and \c y. */ template XSIMD_INLINE batch avgr(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::avgr(x, y, A {}); } /** * @ingroup batch_conversion * * Perform a static_cast from \c T_in to \c T_out on \c x.
* @param x batch_bool of \c T_in * @return \c x cast to \c T_out */ template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& x) noexcept { detail::static_check_supported_config(); detail::static_check_supported_config(); /* Source and destination must hold the same number of elements. */ static_assert(batch_bool::size == batch_bool::size, "Casting between incompatibles batch_bool types."); return kernel::batch_bool_cast(x, batch_bool {}, A {}); } /** * @ingroup batch_conversion * * Perform a static_cast from \c T_in to \c T_out on \c x. * @param x batch of \c T_in * @return \c x cast to \c T_out */ template XSIMD_INLINE batch batch_cast(batch const& x) noexcept { detail::static_check_supported_config(); detail::static_check_supported_config(); return kernel::batch_cast(x, batch {}, A {}); } /** * @ingroup batch_miscellaneous * * Computes the sign bit of \c x. * @param x batch of scalars * @return the sign bit of \c x */ template XSIMD_INLINE batch bitofsign(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::bitofsign(x, A {}); } /** * @ingroup batch_bitwise * * Computes the bitwise and of the batches \c x and \c y. * @param x batch involved in the operation. * @param y batch involved in the operation. * @return the result of the bitwise and. */ template XSIMD_INLINE batch bitwise_and(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x & y; } /** * @ingroup batch_bitwise * * Computes the bitwise and of the boolean batches \c x and \c y. * @param x batch of booleans involved in the operation. * @param y batch of booleans involved in the operation. * @return the result of the bitwise and. */ template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& x, batch_bool const& y) noexcept { detail::static_check_supported_config(); return x & y; } /** * @ingroup batch_bitwise * * Computes the bitwise and not of batches \c x and \c y. * @param x batch involved in the operation. * @param y batch involved in the operation. * @return the result of the bitwise and not.
*/ template XSIMD_INLINE batch bitwise_andnot(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::bitwise_andnot(x, y, A {}); } /** * @ingroup batch_bool_logical * * Computes the bitwise and not of the boolean batches \c x and \c y. * @param x batch of booleans involved in the operation. * @param y batch of booleans involved in the operation. * @return the result of the bitwise and not. */ template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& x, batch_bool const& y) noexcept { detail::static_check_supported_config(); return kernel::bitwise_andnot(x, y, A {}); } /** * @ingroup batch_conversion * * Perform a reinterpret_cast from \c T_in to \c T_out on \c x. * @param x batch of \c T_in * @return \c x reinterpreted as \c T_out */ template XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept { detail::static_check_supported_config(); detail::static_check_supported_config(); return kernel::bitwise_cast(x, batch {}, A {}); } namespace detail { // Detection for kernel overloads accepting ``batch_constant`` in ``bitwise_lshift`` // directly (or in a parent register function). // The ``batch_constant`` overload is a rare but useful optimization. // Running the detection here is less error prone than adding a fallback to all // architectures.
template struct has_bitwise_lshift_batch_const : std::false_type { }; template struct has_bitwise_lshift_batch_const< Arch, Batch, BatchConstant, void_t( std::declval(), std::declval(), Arch {}))>> : std::true_type { }; template XSIMD_INLINE batch bitwise_lshift_batch_const(batch const& x, batch_constant shift, std::true_type) noexcept { // Optimized ``batch_constant`` implementation return kernel::bitwise_lshift(x, shift, Arch {}); } template XSIMD_INLINE batch bitwise_lshift_batch_const(batch const& x, batch_constant shift, std::false_type) noexcept { // Fallback to regular run-time implementation return kernel::bitwise_lshift(x, shift.as_batch(), Arch {}); } } /** * @ingroup batch_bitwise * * Perform a bitwise shift to the left. * @param x batch of values to shift. * @param shift scalar amount to shift * @return shifted \c x. */ template XSIMD_INLINE batch bitwise_lshift(batch const& x, int shift) noexcept { detail::static_check_supported_config(); return kernel::bitwise_lshift(x, shift, A {}); } template XSIMD_INLINE batch bitwise_lshift(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::bitwise_lshift(x, A {}); } /** * @ingroup batch_bitwise * * Perform a bitwise shift to the left, with the shift amounts given as a batch. * @param x batch of values to shift. * @param shift batch of shift amounts. * @return shifted \c x. */ template XSIMD_INLINE batch bitwise_lshift(batch const& x, batch const& shift) noexcept { detail::static_check_supported_config(); return kernel::bitwise_lshift(x, shift, A {}); } /** * @ingroup batch_bitwise * * Perform a bitwise shift to the left, with the shift amounts given as a * \c batch_constant. The dedicated kernel overload is used when * \c detail::has_bitwise_lshift_batch_const detects one; otherwise the shift * is performed with \c shift.as_batch(). * @param x batch of values to shift. * @param shift compile-time constant batch of shift amounts. * @return shifted \c x. */ template XSIMD_INLINE batch bitwise_lshift(batch const& x, batch_constant shift) noexcept { detail::static_check_supported_config(); using has_batch_const_impl = detail::has_bitwise_lshift_batch_const; return detail::bitwise_lshift_batch_const(x, shift, has_batch_const_impl {}); } /** * @ingroup batch_bitwise * * Computes the bitwise not of batch \c x. * @param x batch involved in the operation. * @return the result of the bitwise not.
*/ template XSIMD_INLINE batch bitwise_not(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::bitwise_not(x, A {}); } /** * @ingroup batch_bitwise * * Computes the bitwise not of the boolean batch \c x. * @param x batch of booleans involved in the operation. * @return the result of the bitwise not. */ template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::bitwise_not(x, A {}); } /** * @ingroup batch_bitwise * * Computes the bitwise or of the batches \c x and \c y. * @param x batch involved in the operation. * @param y batch involved in the operation. * @return the result of the bitwise or. */ template XSIMD_INLINE batch bitwise_or(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x | y; } /** * @ingroup batch_bitwise * * Computes the bitwise or of the boolean batches \c x and \c y. * @param x batch of booleans involved in the operation. * @param y batch of booleans involved in the operation. * @return the result of the bitwise or. */ template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& x, batch_bool const& y) noexcept { detail::static_check_supported_config(); return x | y; } /** * @ingroup batch_bitwise * * Perform a bitwise shift to the right. * @param x batch of values to shift. * @param shift scalar amount to shift * @return shifted \c x. */ template XSIMD_INLINE batch bitwise_rshift(batch const& x, int shift) noexcept { detail::static_check_supported_config(); return kernel::bitwise_rshift(x, shift, A {}); } /** * @ingroup batch_bitwise * * Perform a bitwise shift to the right, with the shift amounts given as a batch. * @param x batch of values to shift. * @param shift batch of shift amounts. * @return shifted \c x. */ template XSIMD_INLINE batch bitwise_rshift(batch const& x, batch const& shift) noexcept { detail::static_check_supported_config(); return kernel::bitwise_rshift(x, shift, A {}); } template XSIMD_INLINE batch bitwise_rshift(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::bitwise_rshift(x, A {}); } /** * @ingroup batch_bitwise * * Computes the bitwise xor of the batches \c x and \c y.
* @param x batch involved in the operation. * @param y batch involved in the operation. * @return the result of the bitwise xor. */ template XSIMD_INLINE batch bitwise_xor(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x ^ y; } /** * @ingroup batch_bitwise * * Computes the bitwise xor of the boolean batches \c x and \c y. * @param x batch of booleans involved in the operation. * @param y batch of booleans involved in the operation. * @return the result of the bitwise xor. */ template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& x, batch_bool const& y) noexcept { detail::static_check_supported_config(); return x ^ y; } /** * @ingroup batch_data_transfer * * Creates a batch from the single value \c v. If \c v is a boolean, * this function returns a batch_bool. If you need another type * of batch_bool, please use \c broadcast_as instead. * @param v the value used to initialize the batch * @return a new batch instance */ template XSIMD_INLINE typename kernel::detail::broadcaster::return_type broadcast(T v) noexcept { detail::static_check_supported_config(); return kernel::detail::broadcaster::run(v); } /** * @ingroup batch_data_transfer * * Creates a batch from the single value \c v and * the specified batch value type \c To. The value is first converted to the * value type of the resulting batch (or to \c bool, depending on the * compile-time condition) before the batch is built. * @param v the value used to initialize the batch * @return a new batch instance */ template XSIMD_INLINE simd_return_type broadcast_as(From v) noexcept { detail::static_check_supported_config(); using batch_value_type = typename simd_return_type::value_type; using value_type = std::conditional_t::value, bool, batch_value_type>; return simd_return_type(value_type(v)); } /** * @ingroup batch_math * * Computes the cube root of the batch \c x. * @param x batch of floating point values. * @return the cube root of \c x.
*/ template XSIMD_INLINE batch cbrt(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::cbrt(x, A {}); } /** * @ingroup batch_rounding * * Computes the batch of smallest integer values not less than * scalars in \c x. * @param x batch of floating point values. * @return the batch of smallest integer values not less than \c x. */ template XSIMD_INLINE batch ceil(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::ceil(x, A {}); } /** * @ingroup batch_math * * Clips the values of the batch \c x between those of the batches \c lo and \c hi. * @param x batch of scalar values. * @param lo batch of scalar values. * @param hi batch of scalar values. * @return the result of the clipping. */ template XSIMD_INLINE batch clip(batch const& x, batch const& lo, batch const& hi) noexcept { detail::static_check_supported_config(); return kernel::clip(x, lo, hi, A {}); } /** * @ingroup batch_data_transfer * * Picks the elements of \c x selected by \c mask and appends them to the * resulting vector, zeroing the remaining slots. * @param x batch of values to pick from. * @param mask batch of booleans selecting the elements to keep. * @return the compressed batch. */ template XSIMD_INLINE batch compress(batch const& x, batch_bool const& mask) noexcept { detail::static_check_supported_config(); return kernel::compress(x, mask, A {}); } /** * @ingroup batch_complex * * Computes the conjugate of the batch \c z. * @param z batch of complex values. * @return the conjugate of \c z. */ template XSIMD_INLINE complex_batch_type_t> conj(batch const& z) noexcept { return kernel::conj(z, A {}); } /** * @ingroup batch_miscellaneous * * Computes a value whose absolute value matches * that of \c x, but whose sign bit matches that of \c y. * @param x batch of scalars * @param y batch of scalars * @return batch whose absolute value matches that of \c x, but whose sign bit * matches that of \c y.
*/ template XSIMD_INLINE batch copysign(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::copysign(x, y, A {}); } /** * @ingroup batch_trigo * * Computes the cosine of the batch \c x. * @param x batch of floating point values. * @return the cosine of \c x. */ template XSIMD_INLINE batch cos(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::cos(x, A {}); } /** * @ingroup batch_trigo * * Computes the hyperbolic cosine of the batch \c x. * @param x batch of floating point values. * @return the hyperbolic cosine of \c x. */ template XSIMD_INLINE batch cosh(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::cosh(x, A {}); } /** * @ingroup batch_bool_reducers * * Counts the number of values set to true in the batch \c x. * @param x batch of booleans * @return the result of the counting. */ template XSIMD_INLINE size_t count(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::count(x, A {}); } /** * @ingroup batch_arithmetic * * Subtracts 1 from batch \c x. * @param x batch involved in the decrement. * @return the subtraction of \c x and 1. */ template XSIMD_INLINE batch decr(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::decr(x, A {}); } /** * @ingroup batch_arithmetic * * Subtracts 1 from batch \c x for each element where \c mask is true. * @param x batch involved in the decrement. * @param mask whether to perform the decrement or not. Can be a \c * batch_bool or a \c batch_bool_constant. * @return the subtraction of \c x and 1 when \c mask is true. */ template XSIMD_INLINE batch decr_if(batch const& x, Mask const& mask) noexcept { detail::static_check_supported_config(); return kernel::decr_if(x, mask, A {}); } /** * @ingroup batch_arithmetic * * Computes the division of the batch \c x by the batch \c y.
* @param x batch holding the dividends. * @param y batch holding the divisors. * @return the result of the division. */ template XSIMD_INLINE batch div(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x / y; } /** * @ingroup batch_logical * * Element-wise equality comparison of batches \c x and \c y. * @param x batch of scalars * @param y batch of scalars * @return a boolean batch; the exact type is deduced from \c x == \c y. */ template XSIMD_INLINE auto eq(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x == y; } /** * @ingroup batch_logical * * Element-wise equality comparison of batches of boolean values \c x and \c y. * @param x batch of booleans involved in the comparison. * @param y batch of booleans involved in the comparison. * @return a boolean batch. */ template XSIMD_INLINE batch_bool eq(batch_bool const& x, batch_bool const& y) noexcept { detail::static_check_supported_config(); return x == y; } /** * @ingroup batch_math * * Computes the natural exponential of the batch \c x. * @param x batch of floating point values. * @return the natural exponential of \c x. */ template XSIMD_INLINE batch exp(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::exp(x, A {}); } /** * @ingroup batch_math * * Computes the base 10 exponential of the batch \c x. * @param x batch of floating point values. * @return the base 10 exponential of \c x. */ template XSIMD_INLINE batch exp10(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::exp10(x, A {}); } /** * @ingroup batch_math * * Computes the base 2 exponential of the batch \c x. * @param x batch of floating point values. * @return the base 2 exponential of \c x.
*/ template XSIMD_INLINE batch exp2(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::exp2(x, A {}); } /** * @ingroup batch_data_transfer * * Loads contiguous elements from \c x and places them in the slots selected by \c * mask, zeroing the other slots. * @param x batch providing the contiguous elements. * @param mask batch of booleans selecting the destination slots. * @return the expanded batch. */ template XSIMD_INLINE batch expand(batch const& x, batch_bool const& mask) noexcept { detail::static_check_supported_config(); return kernel::expand(x, mask, A {}); } /** * @ingroup batch_math * * Computes the natural exponential of the batch \c x, minus one. * @param x batch of floating point values. * @return the natural exponential of \c x, minus one. */ template XSIMD_INLINE batch expm1(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::expm1(x, A {}); } /** * @ingroup batch_math_extra * * Computes the error function of the batch \c x. * @param x batch of floating point values. * @return the error function of \c x. */ template XSIMD_INLINE batch erf(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::erf(x, A {}); } /** * @ingroup batch_math_extra * * Computes the complementary error function of the batch \c x. * @param x batch of floating point values. * @return the complementary error function of \c x. */ template XSIMD_INLINE batch erfc(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::erfc(x, A {}); } /** * Extracts a vector from a pair of vectors: * extracts the lowest vector elements from the second source \c x * and the highest vector elements from the first source \c y, * then concatenates the results into the return value. * @param x batch of integer or floating point values. * @param y batch of integer or floating point values. * @param i integer specifying the lowest vector element to extract from the first source register * @return the batch resulting from the concatenation.
*/
    template XSIMD_INLINE batch extract_pair(batch const& x, batch const& y, std::size_t i) noexcept
    {
        detail::static_check_supported_config();
        return kernel::extract_pair(x, y, i, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the absolute values of each scalar in the batch \c x.
     * Forwards to the same kernel as the generic absolute value (\c kernel::abs).
     * @param x batch of floating point values.
     * @return the absolute values of \c x.
     */
    template XSIMD_INLINE batch fabs(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::abs(x, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the positive difference between \c x and \c y, that is,
     * max(0, x-y).
     * @param x batch of floating point values.
     * @param y batch of floating point values.
     * @return the positive difference.
     */
    template XSIMD_INLINE batch fdim(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fdim(x, y, A {});
    }

    /**
     * @ingroup batch_rounding
     *
     * Computes the batch of largest integer values not greater than
     * scalars in \c x.
     * @param x batch of floating point values.
     * @return the batch of largest integer values not greater than \c x.
     */
    template XSIMD_INLINE batch floor(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::floor(x, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes (x*y) + z in a single instruction when possible.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @param z a batch of integer or floating point values.
     * @return the result of the fused multiply-add operation.
     */
    template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fma(x, y, z, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the larger values of the batches \c x and \c y.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @return a batch of the larger values.
*/
    // Forwards to kernel::max, the same kernel used by max().
    template XSIMD_INLINE batch fmax(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::max(x, y, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the smaller values of the batches \c x and \c y.
     * Forwards to \c kernel::min, the same kernel used by \c min().
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @return a batch of the smaller values.
     */
    template XSIMD_INLINE batch fmin(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::min(x, y, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the modulo of the batch \c x by the batch \c y.
     * @param x batch involved in the modulo.
     * @param y batch involved in the modulo.
     * @return the result of the modulo.
     */
    template XSIMD_INLINE batch fmod(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fmod(x, y, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes (x*y) - z in a single instruction when possible.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @param z a batch of integer or floating point values.
     * @return the result of the fused multiply-sub operation.
     */
    template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fms(x, y, z, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes -(x*y) + z in a single instruction when possible.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @param z a batch of integer or floating point values.
     * @return the result of the fused negated multiply-add operation.
     */
    template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fnma(x, y, z, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes -(x*y) - z in a single instruction when possible.
* @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @param z a batch of integer or floating point values.
     * @return the result of the fused negated multiply-sub operation.
     */
    template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fnms(x, y, z, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes, in a single instruction when possible, (x*y) - z on the
     * even-indexed elements and (x*y) + z on the odd-indexed elements.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @param z a batch of integer or floating point values.
     * @return a batch where each even-indexed element is computed as x * y - z and each odd-indexed element as x * y + z
     */
    template XSIMD_INLINE batch fmas(batch const& x, batch const& y, batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::fmas(x, y, z, A {});
    }

    /**
     * @ingroup batch_fp
     *
     * Splits the number \c x into a normalized fraction and an exponent which is stored in \c y
     * @param x a batch of integer or floating point values.
     * @param y output batch, passed by non-const reference, that receives the exponent.
     * @return the normalized fraction of x
     */
    template XSIMD_INLINE batch frexp(const batch& x, batch, A>& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::frexp(x, y, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Element-wise greater or equal comparison of batches \c x and \c y.
     * @tparam X the actual type of batch.
     * @param x batch involved in the comparison.
     * @param y batch involved in the comparison.
     * @return a boolean batch, the result of \c x >= \c y.
     */
    template XSIMD_INLINE batch_bool ge(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x >= y;
    }

    /**
     * @ingroup batch_logical
     *
     * Element-wise greater than comparison of batches \c x and \c y.
     * @tparam X the actual type of batch.
     * @param x batch involved in the comparison.
* @param y batch involved in the comparison.
     * @return a boolean batch.
     */
    template XSIMD_INLINE batch_bool gt(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x > y;
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Extract the scalar element at compile-time index \c I from batch \c b.
     * A \c static_assert rejects an index \c I that is not smaller than the batch size.
     * @param b the batch to extract from.
     * @return the scalar element at index \c I.
     */
    template XSIMD_INLINE T get(batch const& b) noexcept
    {
        static_assert(I < batch::size, "index out of bounds");
        detail::static_check_supported_config();
        return kernel::get(b, index {}, A {});
    }

    // Overload of get() for boolean batches: returns the element at compile-time
    // index I as a bool, with the same compile-time bounds check.
    template XSIMD_INLINE bool get(batch_bool const& b) noexcept
    {
        static_assert(I < batch_bool::size, "index out of bounds");
        detail::static_check_supported_config();
        return kernel::get(b, index {}, A {});
    }

    // Overload of get() returning the batch's value_type, with the same
    // compile-time bounds check.
    // NOTE(review): presumably the complex-batch overload — confirm against upstream.
    template XSIMD_INLINE typename batch, A>::value_type get(batch, A> const& b) noexcept
    {
        static_assert(I < batch, A>::size, "index out of bounds");
        detail::static_check_supported_config();
        return kernel::get(b, index {}, A {});
    }

    /**
     * @ingroup batch_reducers
     *
     * Parallel horizontal addition: adds the scalars of each batch
     * in the array pointed to by \c row and stores them in a returned
     * batch.
     * @param row an array of \c N batches
     * @return the result of the reduction.
     */
    template XSIMD_INLINE batch haddp(batch const* row) noexcept
    {
        detail::static_check_supported_config();
        return kernel::haddp(row, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the square root of the sum of the squares of the batches
     * \c x, and \c y.
     * @param x batch of floating point values.
     * @param y batch of floating point values.
     * @return the square root of the sum of the squares of \c x and \c y.
     */
    template XSIMD_INLINE batch hypot(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::hypot(x, y, A {});
    }

    /**
     * @ingroup batch_complex
     *
     * Computes the imaginary part of the batch \c x.
     * @param x batch of complex or real values.
     * @return the imaginary part of \c x.
*/
    template XSIMD_INLINE real_batch_type_t> imag(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::imag(x, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Add 1 to batch \c x.
     * @param x batch involved in the increment.
     * @return the sum of \c x and 1.
     */
    template XSIMD_INLINE batch incr(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::incr(x, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Add 1 to batch \c x for each element where \c mask is true.
     * @param x batch involved in the increment.
     * @param mask whether to perform the increment or not. Can be a \c
     * batch_bool or a \c batch_bool_constant.
     * @return the sum of \c x and 1 when \c mask is true.
     */
    template XSIMD_INLINE batch incr_if(batch const& x, Mask const& mask) noexcept
    {
        detail::static_check_supported_config();
        return kernel::incr_if(x, mask, A {});
    }

#ifndef __FAST_MATH__
    /**
     * @ingroup batch_constant
     *
     * Return a batch of scalars representing positive infinity.
     * Only available when \c __FAST_MATH__ is not defined.
     * @return a batch of positive infinity
     */
    // NOTE(review): unlike minusinfinity(), this function is not declared
    // noexcept — confirm whether that is intentional.
    template XSIMD_INLINE B infinity()
    {
        using T = typename B::value_type;
        using A = typename B::arch_type;
        detail::static_check_supported_config();
        return B(std::numeric_limits::infinity());
    }
#endif

    /**
     * @ingroup batch_data_transfer
     *
     * Create a new batch equivalent to \c x but with element \c val set at position \c pos
     * @param x batch
     * @param val value to set
     * @param pos index of the updated slot
     * @return copy of \c x with position \c pos set to \c val
     */
    template XSIMD_INLINE batch insert(batch const& x, T val, index pos) noexcept
    {
        detail::static_check_supported_config();
        return kernel::insert(x, val, pos, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Determines if the scalars in the given batch \c x represent an even integer value
     * @param x batch of floating point values.
     * @return a batch of booleans.
*/
    template XSIMD_INLINE batch_bool is_even(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::is_even(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Determines if the floating-point scalars in the given batch \c x represent an integer value
     * @param x batch of floating point values.
     * @return a batch of booleans.
     */
    template XSIMD_INLINE batch_bool is_flint(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::is_flint(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Determines if the scalars in the given batch \c x represent an odd integer value
     * @param x batch of floating point values.
     * @return a batch of booleans.
     */
    template XSIMD_INLINE batch_bool is_odd(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::is_odd(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Determines if the scalars in the given batch \c x are infinite values.
     * @param x batch of floating point values.
     * @return a batch of booleans.
     */
    template XSIMD_INLINE typename batch::batch_bool_type isinf(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::isinf(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Determines if the scalars in the given batch \c x are finite values.
     * @param x batch of floating point values.
     * @return a batch of booleans.
     */
    template XSIMD_INLINE typename batch::batch_bool_type isfinite(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::isfinite(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Determines if the scalars in the given batch \c x are NaN values.
     * @param x batch of floating point values.
     * @return a batch of booleans.
     */
    template XSIMD_INLINE typename batch::batch_bool_type isnan(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::isnan(x, A {});
    }

    /**
     * @ingroup batch_math_extra
     *
     * Computes the multiplication of the floating point number \c x by 2 raised to the power \c y.
* @param x batch of floating point values.
     * @param y batch of integer values.
     * @return a batch of floating point values.
     */
    template XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::ldexp(x, y, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Element-wise lesser or equal to comparison of batches \c x and \c y.
     * @param x batch involved in the comparison.
     * @param y batch involved in the comparison.
     * @return a boolean batch, the result of \c x <= \c y.
     */
    template XSIMD_INLINE batch_bool le(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x <= y;
    }

    /**
     * @ingroup batch_math_extra
     *
     * Computes the natural logarithm of the gamma function of the batch \c x.
     * @param x batch of floating point values.
     * @return the natural logarithm of the gamma function of \c x.
     */
    template XSIMD_INLINE batch lgamma(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::lgamma(x, A {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr and the specified
     * batch value type \c To. The memory needs to be aligned.
* @param ptr the memory buffer to read
     * @return a new batch instance
     */
    template XSIMD_INLINE simd_return_type load_as(From const* ptr, aligned_mode) noexcept
    {
        using batch_value_type = typename simd_return_type::value_type;
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        return kernel::load_aligned(ptr, kernel::convert {}, A {});
    }

    // Aligned load from a buffer of bool; delegates to the return type's own load_aligned().
    template XSIMD_INLINE simd_return_type load_as(bool const* ptr, aligned_mode) noexcept
    {
        detail::static_check_supported_config();
        return simd_return_type::load_aligned(ptr);
    }

    // Aligned load from a buffer of std::complex values, through kernel::load_complex_aligned.
    template XSIMD_INLINE simd_return_type, To, A> load_as(std::complex const* ptr, aligned_mode) noexcept
    {
        detail::static_check_supported_config();
        using batch_value_type = typename simd_return_type, To, A>::value_type;
        return kernel::load_complex_aligned(ptr, kernel::convert {}, A {});
    }

    // Stream-mode counterpart of the generic aligned load_as(), through kernel::load_stream.
    template XSIMD_INLINE simd_return_type load_as(From const* ptr, stream_mode) noexcept
    {
        using batch_value_type = typename simd_return_type::value_type;
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        return kernel::load_stream(ptr, kernel::convert {}, A {});
    }

    // Stream-mode load from a buffer of bool; delegates to the return type's own load_stream().
    template XSIMD_INLINE simd_return_type load_as(bool const* ptr, stream_mode) noexcept
    {
        detail::static_check_supported_config();
        return simd_return_type::load_stream(ptr);
    }

    // Stream-mode load from a buffer of std::complex values, through kernel::load_complex_stream.
    template XSIMD_INLINE simd_return_type, To, A> load_as(std::complex const* ptr, stream_mode) noexcept
    {
        detail::static_check_supported_config();
        using batch_value_type = typename simd_return_type, To, A>::value_type;
        return kernel::load_complex_stream(ptr, kernel::convert {}, A {});
    }

#ifdef XSIMD_ENABLE_XTL_COMPLEX
    // xtl::xcomplex overloads (only built when XSIMD_ENABLE_XTL_COMPLEX is defined):
    // the pointer is reinterpret_cast and forwarded to another load_as() overload.
    template XSIMD_INLINE simd_return_type, To, A> load_as(xtl::xcomplex const* ptr, aligned_mode) noexcept
    {
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        return load_as(reinterpret_cast const*>(ptr), aligned_mode());
    }

    template XSIMD_INLINE simd_return_type, To, A> load_as(xtl::xcomplex const* ptr, stream_mode) noexcept
    {
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        return load_as(reinterpret_cast const*>(ptr), stream_mode());
    }
#endif

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr and the specified
     * batch value type \c To. The memory does not need to be aligned.
     * @param ptr the memory buffer to read
     * @return a new batch instance
     */
    template XSIMD_INLINE simd_return_type load_as(From const* ptr, unaligned_mode) noexcept
    {
        using batch_value_type = typename simd_return_type::value_type;
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        return kernel::load_unaligned(ptr, kernel::convert {}, A {});
    }

    // Unaligned load from a buffer of bool; delegates to the return type's own load_unaligned().
    // NOTE(review): unlike the aligned_mode and stream_mode bool overloads, this one
    // does not call detail::static_check_supported_config() — confirm whether the
    // omission is intentional.
    template XSIMD_INLINE simd_return_type load_as(bool const* ptr, unaligned_mode) noexcept
    {
        return simd_return_type::load_unaligned(ptr);
    }

    // Unaligned load from a buffer of std::complex values, through kernel::load_complex_unaligned.
    template XSIMD_INLINE simd_return_type, To, A> load_as(std::complex const* ptr, unaligned_mode) noexcept
    {
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        using batch_value_type = typename simd_return_type, To, A>::value_type;
        return kernel::load_complex_unaligned(ptr, kernel::convert {}, A {});
    }

#ifdef XSIMD_ENABLE_XTL_COMPLEX
    // xtl::xcomplex unaligned overload: reinterpret_cast the pointer and forward
    // to another load_as() overload.
    template XSIMD_INLINE simd_return_type, To, A> load_as(xtl::xcomplex const* ptr, unaligned_mode) noexcept
    {
        detail::static_check_supported_config();
        detail::static_check_supported_config();
        return load_as(reinterpret_cast const*>(ptr), unaligned_mode());
    }
#endif

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr. The
     * memory needs to be aligned. This is the default mode when no
     * mode tag is passed.
     * @param ptr the memory buffer to read
     * @return a new batch instance
     */
    template XSIMD_INLINE batch load(From const* ptr, aligned_mode = {}) noexcept
    {
        detail::static_check_supported_config();
        return load_as(ptr, aligned_mode {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr. The
     * memory does not need to be aligned.
* @param ptr the memory buffer to read
     * @return a new batch instance
     */
    template XSIMD_INLINE batch load(From const* ptr, unaligned_mode) noexcept
    {
        detail::static_check_supported_config();
        return load_as(ptr, unaligned_mode {});
    }

    // Stream-mode overload of load(): forwards to load_as() with a stream_mode tag.
    template XSIMD_INLINE batch load(From const* ptr, stream_mode) noexcept
    {
        detail::static_check_supported_config();
        return load_as(ptr, stream_mode {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr using a mask. Elements
     * corresponding to \c false in the mask are not accessed in memory and are
     * zero-initialized in the resulting batch.
     * @param ptr the memory buffer to read. This overload uses the aligned
     * mode, which is also the default when no mode tag is passed.
     * @param mask selection mask for the elements to load
     * @return a new batch instance
     */
    template XSIMD_INLINE batch load(From const* ptr, batch_bool_constant const& mask, aligned_mode = {}) noexcept
    {
        detail::static_check_supported_config();
        return batch::load(ptr, mask, aligned_mode {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr using a mask. Elements
     * corresponding to \c false in the mask are not accessed in memory and are
     * zero-initialized in the resulting batch.
     * @param ptr the memory buffer to read. The buffer does not need to be
     * aligned.
     * @param mask selection mask for the elements to load
     * @return a new batch instance
     */
    template XSIMD_INLINE batch load(From const* ptr, batch_bool_constant const& mask, unaligned_mode) noexcept
    {
        detail::static_check_supported_config();
        return batch::load(ptr, mask, unaligned_mode {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr. The
     * memory needs to be aligned.
     * @param ptr the memory buffer to read
     * @return a new batch instance
     */
    template XSIMD_INLINE batch load_aligned(From const* ptr) noexcept
    {
        detail::static_check_supported_config();
        return load_as(ptr, aligned_mode {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Creates a batch from the buffer \c ptr. The
     * memory does not need to be aligned.
* @param ptr the memory buffer to read
     * @return a new batch instance
     */
    template XSIMD_INLINE batch load_unaligned(From const* ptr) noexcept
    {
        detail::static_check_supported_config();
        return load_as(ptr, unaligned_mode {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the natural logarithm of the batch \c x.
     * @param x batch of floating point values.
     * @return the natural logarithm of \c x.
     */
    template XSIMD_INLINE batch log(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::log(x, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the base 2 logarithm of the batch \c x.
     * @param x batch of floating point values.
     * @return the base 2 logarithm of \c x.
     */
    template XSIMD_INLINE batch log2(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::log2(x, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the base 10 logarithm of the batch \c x.
     * @param x batch of floating point values.
     * @return the base 10 logarithm of \c x.
     */
    template XSIMD_INLINE batch log10(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::log10(x, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the natural logarithm of one plus the batch \c x.
     * @param x batch of floating point values.
     * @return the natural logarithm of one plus \c x.
     */
    template XSIMD_INLINE batch log1p(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::log1p(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Element-wise lesser than comparison of batches \c x and \c y.
     * @param x batch involved in the comparison.
     * @param y batch involved in the comparison.
     * @return a boolean batch, the result of \c x < \c y.
     */
    template XSIMD_INLINE batch_bool lt(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x < y;
    }

    /**
     * @ingroup batch_math
     *
     * Computes the larger values of the batches \c x and \c y.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
* @return a batch of the larger values.
     */
    template XSIMD_INLINE batch max(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::max(x, y, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the smaller values of the batches \c x and \c y.
     * @param x a batch of integer or floating point values.
     * @param y a batch of integer or floating point values.
     * @return a batch of the smaller values.
     */
    template XSIMD_INLINE batch min(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::min(x, y, A {});
    }

    /**
     * @ingroup batch_constant
     *
     * Return a batch of scalars representing negative infinity
     * @return a batch of negative infinity
     */
    // NOTE(review): infinity() is wrapped in `#ifndef __FAST_MATH__`, whereas this
    // function is not — confirm whether the asymmetry is intentional.
    template XSIMD_INLINE B minusinfinity() noexcept
    {
        using T = typename B::value_type;
        using A = typename B::arch_type;
        detail::static_check_supported_config();
        return B(-std::numeric_limits::infinity());
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes the integer modulo of the batch \c x by the batch \c y.
     * @param x batch involved in the modulo.
     * @param y batch involved in the modulo.
     * @return the result of the modulo, i.e. \c x % \c y.
     */
    template XSIMD_INLINE batch mod(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x % y;
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes the product of the batches \c x and \c y.
     * @tparam X the actual type of batch.
     * @param x batch involved in the product.
     * @param y batch involved in the product.
     * @return the result of the product, i.e. \c x * \c y.
     */
    template XSIMD_INLINE batch mul(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x * y;
    }

    /**
     * @ingroup batch_rounding
     *
     * Rounds the scalars in \c x to integer values (in floating point format), using
     * the current rounding mode.
     * @param x batch of floating point values.
     * @return the batch of nearest integer values.
*/
    template XSIMD_INLINE batch nearbyint(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::nearbyint(x, A {});
    }

    /**
     * @ingroup batch_rounding
     *
     * Rounds the scalars in \c x to integer values (in integer format) using
     * the current rounding mode.
     * @param x batch of floating point values.
     * @return the batch of nearest integer values.
     *
     * @warning For very large values the conversion to int silently overflows.
     */
    template XSIMD_INLINE batch, A> nearbyint_as_int(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::nearbyint_as_int(x, A {});
    }

    /**
     * @ingroup batch_logical
     *
     * Element-wise inequality comparison of batches \c x and \c y.
     * @param x batch involved in the comparison.
     * @param y batch involved in the comparison.
     * @return a boolean batch, the result of \c x != \c y.
     */
    template XSIMD_INLINE auto neq(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return x != y;
    }

    /**
     * @ingroup batch_logical
     *
     * Element-wise inequality comparison of batches of boolean values \c x and \c y.
     * @param x batch of booleans involved in the comparison.
     * @param y batch of booleans involved in the comparison.
     * @return a boolean batch, the result of \c x != \c y.
     */
    template XSIMD_INLINE batch_bool neq(batch_bool const& x, batch_bool const& y) noexcept
    {
        detail::static_check_supported_config();
        return x != y;
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes the opposite of the batch \c x.
     * @param x batch involved in the operation.
     * @return the opposite of \c x, i.e. \c -x.
     */
    template XSIMD_INLINE batch neg(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return -x;
    }

    /**
     * @ingroup batch_math_extra
     *
     * Computes the next representable floating-point
     * value following x in the direction of y
     * @param x batch of floating point values.
     * @param y batch of floating point values.
     * @return the next representable value after \c x in the direction of \c y.
*/
    template XSIMD_INLINE batch nextafter(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::nextafter(x, y, A {});
    }

    /**
     * @ingroup batch_complex
     *
     * Computes the norm of the batch \c x.
     * @param x batch of complex or real values.
     * @return the norm of \c x.
     */
    template XSIMD_INLINE real_batch_type_t> norm(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::norm(x, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Returns a complex batch with magnitude \c r and phase angle \c theta.
     * @param r The magnitude of the desired complex result.
     * @param theta The phase angle of the desired complex result. Defaults to
     * a value-initialized batch when omitted.
     * @return \c r exp(i * \c theta).
     */
    template XSIMD_INLINE complex_batch_type_t> polar(batch const& r, batch const& theta = batch {}) noexcept
    {
        detail::static_check_supported_config();
        return kernel::polar(r, theta, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * No-op on \c x.
     * @param x batch involved in the operation.
     * @return \c x (the result of unary plus, \c +x).
     */
    template XSIMD_INLINE batch pos(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return +x;
    }

    /**
     * @ingroup batch_math
     *
     * Computes the value of the batch \c x raised to the power
     * \c y.
     * @param x batch of floating point values.
     * @param y batch of floating point values.
     * @return \c x raised to the power \c y.
     */
    template XSIMD_INLINE batch pow(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::pow(x, y, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the value of the batch \c x raised to the power
     * \c y.
     * @param x batch of complex floating point values.
     * @param y batch of floating point values.
     * @return \c x raised to the power \c y.
     */
    template XSIMD_INLINE batch, A> pow(batch, A> const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::pow(x, y, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the value of the batch \c x raised to the power
     * \c y.
* @param x batch of floating point values.
     * @param y batch of complex floating point values.
     * @return \c x raised to the power \c y.
     */
    // NOTE(review): the parameter docs above assume x is the real batch and y the
    // complex one, mirroring the preceding overload with the roles swapped — confirm.
    template XSIMD_INLINE batch, A> pow(batch const& x, batch, A> const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::pow(x, y, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the value of the batch \c x raised to the power
     * \c y. Forwards to \c kernel::ipow.
     * @param x batch of values.
     * @param y exponent, passed by value as a single \c ITy (not a batch).
     * @return \c x raised to the power \c y.
     */
    template ::value>> XSIMD_INLINE batch pow(batch const& x, ITy y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::ipow(x, y, A {});
    }

    /**
     * @ingroup batch_complex
     *
     * Computes the projection of the batch \c z.
     * @param z batch of complex or real values.
     * @return the projection of \c z.
     */
    template XSIMD_INLINE complex_batch_type_t> proj(batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::proj(z, A {});
    }

    /**
     * @ingroup batch_complex
     *
     * Computes the real part of the batch \c z.
     * @param z batch of complex or real values.
     * @return the real part of \c z.
     */
    template XSIMD_INLINE real_batch_type_t> real(batch const& z) noexcept
    {
        detail::static_check_supported_config();
        return kernel::real(z, A {});
    }

    /**
     * @ingroup batch_arithmetic
     *
     * Computes the approximate reciprocal of the batch \c x.
     * The maximum relative error for this approximation is
     * less than 1.5*2^-12.
     * @param x batch of floating point numbers.
     * @return the reciprocal.
     */
    template ::value>> XSIMD_INLINE batch reciprocal(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::reciprocal(x, A {});
    }

    /**
     * @ingroup batch_reducers
     *
     * Generic reducer using only batch operations
     * @param f reducing function, accepting `batch ()(batch, batch)`
     * @param x batch involved in the reduction
     * @return the result of the reduction, as a scalar.
*/
    template XSIMD_INLINE T reduce(F&& f, batch const& x) noexcept
    {
        detail::static_check_supported_config();
        // f is perfectly forwarded; the batch size is passed as a compile-time constant.
        return kernel::detail::reduce(std::forward(f), x, std::integral_constant::size>());
    }

    /**
     * @ingroup batch_reducers
     *
     * Adds all the scalars of the batch \c x.
     * @param x batch involved in the reduction
     * @return the result of the reduction.
     */
    template XSIMD_INLINE T reduce_add(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::reduce_add(x, A {});
    }

    /**
     * @ingroup batch_reducers
     *
     * Max of all the scalars of the batch \c x.
     * @param x batch involved in the reduction
     * @return the result of the reduction.
     */
    template XSIMD_INLINE T reduce_max(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::reduce_max(x, A {});
    }

    /**
     * @ingroup batch_reducers
     *
     * Min of all the scalars of the batch \c x.
     * @param x batch involved in the reduction
     * @return the result of the reduction.
     */
    template XSIMD_INLINE T reduce_min(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::reduce_min(x, A {});
    }

    /**
     * @ingroup batch_reducers
     *
     * Multiplies all the scalars of the batch \c x.
     * @param x batch involved in the reduction
     * @return the result of the reduction.
     */
    template XSIMD_INLINE T reduce_mul(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::reduce_mul(x, A {});
    }

    /**
     * @ingroup batch_math
     *
     * Computes the remainder of dividing \c x by \c y
     * @param x batch of scalar values
     * @param y batch of scalar values
     * @return the remainder of the division of \c x by \c y.
     */
    template XSIMD_INLINE batch remainder(batch const& x, batch const& y) noexcept
    {
        detail::static_check_supported_config();
        return kernel::remainder(x, y, A {});
    }

    /**
     * @ingroup batch_rounding
     *
     * Rounds the scalars in \c x to integer values (in floating point format), using
     * the current rounding mode.
     * @param x batch of floating point values.
     * @return the batch of rounded values.
*/
    // Implemented by delegating to nearbyint().
    template XSIMD_INLINE batch rint(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return nearbyint(x);
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Slide the whole batch to the left by \c N elements, and reintroduce the
     * slid out elements from the right. This is different from
     * \c rotl that rotates each batch element to the left.
     *
     * @tparam N Amount of elements to rotate to the left.
     * @param x batch of integer values.
     * @return rotated batch.
     */
    template XSIMD_INLINE batch rotate_left(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::rotate_left(x, A {});
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Slide the whole batch to the right by \c N elements, and reintroduce the
     * slid out elements from the left. This is different from
     * \c rotr that rotates each batch element to the right.
     *
     * @tparam N Amount of elements to rotate to the right.
     * @param x batch of integer values.
     * @return rotated batch.
     */
    template XSIMD_INLINE batch rotate_right(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::rotate_right(x, A {});
    }

    /**
     * @ingroup batch_bitwise
     *
     * Perform a bitwise shift to the left, reintroducing the shifted out bits
     * to the right
     * @param x batch to rotate
     * @param shift scalar amount to shift
     * @return rotated \c x.
     */
    template XSIMD_INLINE batch rotl(batch const& x, int shift) noexcept
    {
        detail::static_check_supported_config();
        return kernel::rotl(x, shift, A {});
    }

    // Overload of rotl() where the shift amount is itself given as a batch.
    template XSIMD_INLINE batch rotl(batch const& x, batch const& shift) noexcept
    {
        detail::static_check_supported_config();
        return kernel::rotl(x, shift, A {});
    }

    // Overload of rotl() taking no runtime shift argument.
    // NOTE(review): presumably the shift amount is a template parameter — confirm.
    template XSIMD_INLINE batch rotl(batch const& x) noexcept
    {
        detail::static_check_supported_config();
        return kernel::rotl(x, A {});
    }

    /**
     * @ingroup batch_bitwise
     *
     * Perform a bitwise shift to the right, reintroducing the shifted out bits
     * to the left.
     * @param x batch to rotate
     * @param shift scalar amount to shift
     * @return rotated \c x.
*/ template XSIMD_INLINE batch rotr(batch const& x, int shift) noexcept { detail::static_check_supported_config(); return kernel::rotr(x, shift, A {}); } template XSIMD_INLINE batch rotr(batch const& x, batch const& shift) noexcept { detail::static_check_supported_config(); return kernel::rotr(x, shift, A {}); } template XSIMD_INLINE batch rotr(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::rotr(x, A {}); } /** * @ingroup batch_rounding * * Computes the batch of nearest integer values to scalars in \c x (in * floating point format), rounding halfway cases away from zero, regardless * of the current rounding mode. * @param x batch of floating point values. * @return the batch of nearest integer values. */ template XSIMD_INLINE batch round(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::round(x, A {}); } /** * @ingroup batch_math * * Computes an estimate of the inverse square root of the batch \c x. * * @warning Unlike most xsimd functions, this does not return the same result as the * equivalent scalar operation, trading accuracy for speed. * * @param x batch of floating point values. * @return an estimate of the inverse square root of \c x. */ template XSIMD_INLINE batch rsqrt(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::rsqrt(x, A {}); } /** * @ingroup batch_arithmetic * * Computes the saturated sum of the batch \c x and the batch \c y. * @tparam X the actual type of batch. * @param x batch involved in the saturated addition. * @param y batch involved in the saturated addition. * @return the result of the saturated addition. */ template XSIMD_INLINE batch sadd(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::sadd(x, y, A {}); } /** * @ingroup batch_cond * * Ternary operator for batches: selects values from the batches \c true_br or \c false_br * depending on the boolean values in the constant batch \c cond.
Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? true_br[i] : false_br[i]; * \endcode * @param cond batch condition. * @param true_br batch values for truthy condition. * @param false_br batch value for falsy condition. * @return the result of the selection. */ template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br) noexcept { detail::static_check_supported_config(); return kernel::select(cond, true_br, false_br, A {}); } /** * @ingroup batch_bool_logical * * Ternary operator for conditions: selects values from the batches \c true_br or \c false_br * depending on the boolean values in the constant batch \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? true_br[i] : false_br[i]; * \endcode * @param cond batch condition. * @param true_br batch values for truthy condition. * @param false_br batch value for falsy condition. * @return the result of the selection. */ template XSIMD_INLINE batch_bool select(batch_bool const& cond, batch_bool const& true_br, batch_bool const& false_br) noexcept { detail::static_check_supported_config(); return kernel::select(cond, true_br, false_br, A {}); } /** * @ingroup batch_cond * * Ternary operator for batches: selects values from the batches \c true_br or \c false_br * depending on the boolean values in the constant batch \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? true_br[i] : false_br[i]; * \endcode * @param cond batch condition. * @param true_br batch values for truthy condition. * @param false_br batch value for falsy condition. * @return the result of the selection. 
*/ template XSIMD_INLINE batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br) noexcept { detail::static_check_supported_config(); return kernel::select(cond, true_br, false_br, A {}); } /** * @ingroup batch_cond * * Ternary operator for batches: selects values from the batches \c true_br or \c false_br * depending on the boolean values in the constant batch \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? true_br[i] : false_br[i]; * \endcode * @param cond constant batch condition. * @param true_br batch values for truthy condition. * @param false_br batch value for falsy condition. * @return the result of the selection. */ template XSIMD_INLINE batch select(batch_bool_constant const& cond, batch const& true_br, batch const& false_br) noexcept { detail::static_check_supported_config(); return kernel::select(cond, true_br, false_br, A {}); } /** * @ingroup batch_cond * * Ternary operator for mask batches: selects values from the masks \c true_br or \c false_br * depending on the boolean values in the constant batch \c cond. Equivalent to * \code{.cpp} * for(std::size_t i = 0; i < N; ++i) * res[i] = cond[i] ? true_br[i] : false_br[i]; * \endcode * @param cond constant batch condition. * @param true_br batch values for truthy condition. * @param false_br batch value for falsy condition. * @return the result of the selection. */ template XSIMD_INLINE batch_bool select(batch_bool_constant const& cond, batch_bool const& true_br, batch_bool const& false_br) noexcept { detail::static_check_supported_config(); return kernel::select(cond, true_br, false_br, A {}); } /** * @ingroup batch_data_transfer * * Combine elements from \c x and \c y according to selector \c mask * @param x batch * @param y batch * @param mask constant batch mask of integer elements of the same size as * element of \c x and \c y. 
Each element of the mask indexes the vector that * would be formed by the concatenation of \c x and \c y. For instance * \code{.cpp} * batch_constant * \endcode * Picks \c x[0], \c y[0], \c x[3], \c y[3] * * @return combined batch */ template XSIMD_INLINE std::enable_if_t::value, batch> shuffle(batch const& x, batch const& y, batch_constant mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); static_assert(std::is_unsigned::value, "mask must hold unsigned indices"); detail::static_check_supported_config(); return kernel::shuffle(x, y, mask, A {}); } /** * @ingroup batch_miscellaneous * * Computes the sign of \c x * @param x batch * @return -1 for each negative element, -1 or +1 for each null element and +1 for each positive element */ template XSIMD_INLINE batch sign(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::sign(x, A {}); } /** * @ingroup batch_miscellaneous * * Computes the sign of \c x, assuming x doesn't have any zero * @param x batch * @return -1 for each negative element, -1 or +1 for each null element and +1 for each positive element */ template XSIMD_INLINE batch signnz(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::signnz(x, A {}); } /** * @ingroup batch_trigo * * Computes the sine of the batch \c x. * @param x batch of floating point values. * @return the sine of \c x. */ template XSIMD_INLINE batch sin(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::sin(x, A {}); } /** * @ingroup batch_trigo * * Computes the sine and the cosine of the batch \c x. This method is faster * than calling sine and cosine independently. * @param x batch of floating point values.
* @return a pair containing the sine then the cosine of batch \c x */ template XSIMD_INLINE std::pair, batch> sincos(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::sincos(x, A {}); } /** * @ingroup batch_trigo * * Computes the hyperbolic sine of the batch \c x. * @param x batch of floating point values. * @return the hyperbolic sine of \c x. */ template XSIMD_INLINE batch sinh(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::sinh(x, A {}); } /** * @ingroup batch_data_transfer * * Slide the whole batch to the left by \c N bytes. This is different from * \c bitwise_lshift that shifts each batch element to the left. * * @warning The behavior of this function is platform-dependent on big * endian architectures. * * @tparam N Amount of bytes to slide to the left. * @param x batch of integer values. * @return slid batch. */ template XSIMD_INLINE batch slide_left(batch const& x) noexcept { static_assert(std::is_integral::value, "can only slide batch of integers"); detail::static_check_supported_config(); return kernel::slide_left(x, A {}); } /** * @ingroup batch_data_transfer * * Slide the whole batch to the right by \c N bytes. This is different from * \c bitwise_rshift that shifts each batch element to the right. * * @warning The behavior of this function is platform-dependent on big * endian architectures. * * @tparam N Amount of bytes to slide to the right. * @param x batch of integer values. * @return slid batch. */ template XSIMD_INLINE batch slide_right(batch const& x) noexcept { static_assert(std::is_integral::value, "can only slide batch of integers"); detail::static_check_supported_config(); return kernel::slide_right(x, A {}); } /** * @ingroup batch_math * * Computes the square root of the batch \c x. * @param x batch of floating point values. * @return the square root of \c x.
*/ template XSIMD_INLINE batch sqrt(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::sqrt(x, A {}); } /** * @ingroup batch_arithmetic * * Computes the saturate difference of the batch \c x and the batch \c y. * @tparam X the actual type of batch. * @param x batch involved in the saturated difference. * @param y batch involved in the saturated difference. * @return the result of the saturated difference. */ template XSIMD_INLINE batch ssub(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::ssub(x, y, A {}); } /** * @ingroup batch_data_transfer * * Copy content of batch \c src to the buffer \c dst. The * memory needs to be aligned. * @param dst the memory buffer to write to * @param src the batch to copy */ template XSIMD_INLINE void store_as(To* dst, batch const& src, aligned_mode) noexcept { detail::static_check_supported_config(); kernel::store_aligned(dst, src, A {}); } template XSIMD_INLINE void store_as(bool* dst, batch_bool const& src, aligned_mode) noexcept { detail::static_check_supported_config(); kernel::store(src, dst, A {}); } template XSIMD_INLINE void store_as(std::complex* dst, batch, A> const& src, aligned_mode) noexcept { detail::static_check_supported_config, A>(); kernel::store_complex_aligned(dst, src, A {}); } template XSIMD_INLINE void store_as(To* dst, batch const& src, stream_mode) noexcept { detail::static_check_supported_config(); kernel::store_stream(dst, src, A {}); } template XSIMD_INLINE void store_as(bool* dst, batch_bool const& src, stream_mode) noexcept { detail::static_check_supported_config(); kernel::store_stream(src, dst, A {}); } template XSIMD_INLINE void store_as(std::complex* dst, batch, A> const& src, stream_mode) noexcept { detail::static_check_supported_config, A>(); kernel::store_complex_stream(dst, src, A {}); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE void store_as(xtl::xcomplex* dst, batch, A> const& src, aligned_mode) 
noexcept { store_as(reinterpret_cast*>(dst), src, aligned_mode()); } template XSIMD_INLINE void store_as(xtl::xcomplex* dst, batch, A> const& src, stream_mode) noexcept { detail::static_check_supported_config, A>(); store_as(reinterpret_cast*>(dst), src, stream_mode()); } #endif /** * @ingroup batch_data_transfer * * Copy content of batch \c src to the buffer \c dst. The * memory does not need to be aligned. * @param dst the memory buffer to write to * @param src the batch to copy */ template XSIMD_INLINE void store_as(To* dst, batch const& src, unaligned_mode) noexcept { detail::static_check_supported_config(); kernel::store_unaligned(dst, src, A {}); } template XSIMD_INLINE void store_as(bool* dst, batch_bool const& src, unaligned_mode) noexcept { detail::static_check_supported_config(); kernel::store(src, dst, A {}); } template XSIMD_INLINE void store_as(std::complex* dst, batch, A> const& src, unaligned_mode) noexcept { detail::static_check_supported_config, A>(); kernel::store_complex_unaligned(dst, src, A {}); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template XSIMD_INLINE void store_as(xtl::xcomplex* dst, batch, A> const& src, unaligned_mode) noexcept { detail::static_check_supported_config, A>(); store_as(reinterpret_cast*>(dst), src, unaligned_mode()); } #endif /** * @ingroup batch_data_transfer * * Copy content of batch \c val to the buffer \c mem. The * memory needs to be aligned (this overload forwards with \c aligned_mode). * @param mem the memory buffer to write to * @param val the batch to copy from */ template XSIMD_INLINE void store(T* mem, batch const& val, aligned_mode = {}) noexcept { store_as(mem, val, aligned_mode {}); } /** * @ingroup batch_data_transfer * * Copy content of batch \c val to the buffer \c mem. The * memory does not need to be aligned.
* @param mem the memory buffer to write to * @param val the batch to copy from */ template XSIMD_INLINE void store(T* mem, batch const& val, unaligned_mode) noexcept { store_as(mem, val, unaligned_mode {}); } template XSIMD_INLINE void store(T* mem, batch const& val, stream_mode) noexcept { store_as(mem, val, stream_mode {}); } /** * @ingroup batch_data_transfer * * Copy selected elements of batch \c val to the buffer \c mem using * a mask. Elements corresponding to \c false in the mask are not * written to memory. * @param mem the memory buffer to write to * @param val the batch to copy from * @param mask selection mask for the elements to store */ template XSIMD_INLINE void store(T* mem, batch const& val, batch_bool_constant const& mask, aligned_mode = {}) noexcept { detail::static_check_supported_config(); val.store(mem, mask, aligned_mode {}); } /** * @ingroup batch_data_transfer * * Copy selected elements of batch \c val to the buffer \c mem using a mask. * Elements corresponding to \c false in the mask are not written to memory. * @param mem the memory buffer to write to. The buffer does not need to be * aligned. * @param val the batch to copy from * @param mask selection mask for the elements to store */ template XSIMD_INLINE void store(T* mem, batch const& val, batch_bool_constant const& mask, unaligned_mode) noexcept { detail::static_check_supported_config(); val.store(mem, mask, unaligned_mode {}); } /** * @ingroup batch_data_transfer * * Copy content of batch \c val to the buffer \c mem. The * memory needs to be aligned. * @param mem the memory buffer to write to * @param val the batch to copy from */ template XSIMD_INLINE void store_aligned(T* mem, batch const& val) noexcept { store_as(mem, val, aligned_mode {}); } /** * @ingroup batch_data_transfer * * Copy content of batch \c val to the buffer \c mem. The * memory does not need to be aligned. 
* @param mem the memory buffer to write to * @param val the batch to copy */ template XSIMD_INLINE void store_unaligned(T* mem, batch const& val) noexcept { store_as(mem, val, unaligned_mode {}); } /** * @ingroup batch_arithmetic * * Computes the difference between \c x and \c y * @tparam X the actual type of batch. * @param x scalar or batch of scalars * @param y scalar or batch of scalars * @return the difference between \c x and \c y */ template XSIMD_INLINE batch sub(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return x - y; } /** * @ingroup batch_data_transfer * * Rearrange elements from \c x according to constant mask \c mask * @param x batch * @param mask constant batch mask of integer elements of the same size as * element of \c x * @return swizzled batch */ template XSIMD_INLINE std::enable_if_t::value, batch> swizzle(batch const& x, batch_constant mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); detail::static_check_supported_config(); return kernel::swizzle(x, mask, A {}); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& x, batch_constant mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); static_assert(std::is_unsigned::value, "mask must hold unsigned indices"); detail::static_check_supported_config(); return kernel::swizzle(x, mask, A {}); } /** * @ingroup batch_data_transfer * * Rearrange elements from \c x according to mask \c mask * @param x batch * @param mask batch mask of integer elements of the same size as * element of \c x * @return swizzled batch */ template XSIMD_INLINE std::enable_if_t::value, batch> swizzle(batch const& x, batch mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); detail::static_check_supported_config(); return kernel::swizzle(x, mask, A {}); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& x, batch mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); 
detail::static_check_supported_config(); return kernel::swizzle(x, mask, A {}); } /** * @ingroup batch_trigo * * Computes the tangent of the batch \c x. * @param x batch of floating point values. * @return the tangent of \c x. */ template XSIMD_INLINE batch tan(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::tan(x, A {}); } /** * @ingroup batch_trigo * * Computes the hyperbolic tangent of the batch \c x. * @param x batch of floating point values. * @return the hyperbolic tangent of \c x. */ template XSIMD_INLINE batch tanh(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::tanh(x, A {}); } /** * @ingroup batch_math_extra * * Computes the gamma function of the batch \c x. * @param x batch of floating point values. * @return the gamma function of \c x. */ template XSIMD_INLINE batch tgamma(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::tgamma(x, A {}); } /** * @ingroup batch_conversion * * Perform a conversion from \c i to a value of a floating point type of the same size as \c T. * This is equivalent to \c batch_cast>(i) * @param i batch of integers. * @return \c i converted to a value of a floating point type of the same size as \c T */ template XSIMD_INLINE batch, A> to_float(batch const& i) noexcept { detail::static_check_supported_config(); return batch_cast>(i); } /** * @ingroup batch_conversion * * Perform a conversion from \c x to a value of an integer type of the same size as \c T * This is equivalent to \c batch_cast>(x) * @param x batch. * @return \c x converted to a value of an integer type of the same size as \c T */ template XSIMD_INLINE batch, A> to_int(batch const& x) noexcept { detail::static_check_supported_config(); return batch_cast>(x); } /** * @ingroup batch_data_transfer * * Transposes in place the matrix whose lines are the batches in the range passed as * argument.
* @param matrix_begin pointer to the first line of the matrix to transpose * @param matrix_end pointer to one element after the last line of the matrix to transpose * */ template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); detail::static_check_supported_config(); return kernel::transpose(matrix_begin, matrix_end, A {}); } /** * @ingroup batch_rounding * * Computes the batch of nearest integer values not greater in magnitude * than scalars in \c x. * @param x batch of floating point values. * @return the batch of nearest integer values not greater in magnitude than \c x. */ template XSIMD_INLINE batch trunc(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::trunc(x, A {}); } /** * @ingroup batch_data_transfer * * Unpack and interleave data from the HIGH half of batches \c x and \c y. * Store the results in the Return value. * @param x a batch of integer or floating point or double precision values. * @param y a batch of integer or floating point or double precision values. * @return a batch of the high part of shuffled values. */ template XSIMD_INLINE batch zip_hi(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::zip_hi(x, y, A {}); } /** * @ingroup batch_data_transfer * * Unpack and interleave data from the LOW half of batches \c x and \c y. * Store the results in the Return value. * @param x a batch of integer or floating point or double precision values. * @param y a batch of integer or floating point or double precision values. * @return a batch of the low part of shuffled values. 
*/ template XSIMD_INLINE batch zip_lo(batch const& x, batch const& y) noexcept { detail::static_check_supported_config(); return kernel::zip_lo(x, y, A {}); } /** * @ingroup batch_conversion * * Cast a \c batch_bool of \c T into a \c batch of the same type using the * following rule: if an element of \c self is true, it maps to -1 in the * returned integral batch, otherwise it maps to 0. * * @param self batch_bool of \c T * @return \c self cast to a \c batch of \c T */ template ::value, int> = 3> XSIMD_INLINE batch bitwise_cast(batch_bool const& self) noexcept { T z(0); detail::static_check_supported_config(); return select(self, batch(T(~z)), batch(z)); } template ::value, int> = 3> XSIMD_INLINE batch bitwise_cast(batch_bool const& self) noexcept { T z0(0), z1(0); using int_type = as_unsigned_integer_t; int_type value(~int_type(0)); std::memcpy(&z1, &value, sizeof(int_type)); detail::static_check_supported_config(); return select(self, batch(z1), batch(z0)); } /** * @ingroup batch_bool_reducers * * Returns true if all the boolean values in the batch are true, * false otherwise. * @param x the batch to reduce. * @return a boolean scalar. */ template XSIMD_INLINE bool all(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::all(x, A {}); } /** * @ingroup batch_bool_reducers * * Return true if any of the boolean values in the batch is true, * false otherwise. * @param x the batch to reduce. * @return a boolean scalar. */ template XSIMD_INLINE bool any(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::any(x, A {}); } /** * @ingroup batch_bool_reducers * * Return true if none of the boolean values in the batch is true, * false otherwise. * @param x the batch to reduce. * @return a boolean scalar. 
*/ template XSIMD_INLINE bool none(batch_bool const& x) noexcept { detail::static_check_supported_config(); return !xsimd::any(x); } /** * @ingroup batch_bool_reducers * * Return the number of leading `false` values in the batch. * @param x the batch to reduce. * @return an integer scalar. */ template XSIMD_INLINE size_t countl_zero(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::countl_zero(x, A {}); } /** * @ingroup batch_bool_reducers * * Return the number of leading `true` values in the batch. * @param x the batch to reduce. * @return an integer scalar. */ template XSIMD_INLINE size_t countl_one(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::countl_one(x, A {}); } /** * @ingroup batch_bool_reducers * * Return the number of trailing `false` values in the batch. * @param x the batch to reduce. * @return an integer scalar. */ template XSIMD_INLINE size_t countr_zero(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::countr_zero(x, A {}); } /** * @ingroup batch_bool_reducers * * Return the number of trailing `true` values in the batch. * @param x the batch to reduce. * @return an integer scalar. */ template XSIMD_INLINE size_t countr_one(batch_bool const& x) noexcept { detail::static_check_supported_config(); return kernel::countr_one(x, A {}); } /** * @ingroup batch_conversion * * Widen batch \c x from type \c T to a type with twice as many bytes and * the same sign (for integers) or from float to double. * @param x batch of \c T * @return two batches of \c widen_t */ template XSIMD_INLINE std::array, A>, 2> widen(batch const& x) noexcept { detail::static_check_supported_config(); return kernel::widen(x, A {}); } /** * @ingroup batch_miscellaneous * * Dump the content of batch \c x to stream \c o * @param o the stream where the batch is dumped * @param x batch to dump. 
* @return a reference to \c o */ template XSIMD_INLINE std::ostream& operator<<(std::ostream& o, batch const& x) noexcept { detail::static_check_supported_config(); constexpr auto size = batch::size; alignas(A::alignment()) T buffer[size]; x.store_aligned(&buffer[0]); o << '('; for (std::size_t i = 0; i < size - 1; ++i) o << buffer[i] << ", "; return o << buffer[size - 1] << ')'; } /** * @ingroup batch_miscellaneous * * Dump the content of batch \c x to stream \c o * @param o the stream where the batch is dumped * @param x batch to dump. * @return a reference to \c o */ template XSIMD_INLINE std::ostream& operator<<(std::ostream& o, batch_bool const& x) noexcept { detail::static_check_supported_config(); constexpr auto size = batch_bool::size; alignas(A::alignment()) bool buffer[size]; x.store_aligned(&buffer[0]); o << '('; for (std::size_t i = 0; i < size - 1; ++i) o << buffer[i] << ", "; return o << buffer[size - 1] << ')'; } } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx2_register.hpp000066400000000000000000000026411517435117100263460ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX2_REGISTER_HPP #define XSIMD_AVX2_REGISTER_HPP #include "./xsimd_avx_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX2 instructions */ struct avx2 : avx { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx2"; } }; #if XSIMD_WITH_AVX2 #if !XSIMD_WITH_AVX #error "architecture inconsistency: avx2 requires avx" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512bw_register.hpp000066400000000000000000000031721517435117100270450ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512BW_REGISTER_HPP #define XSIMD_AVX512BW_REGISTER_HPP #include "./xsimd_avx512dq_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512BW instructions */ struct avx512bw : avx512dq { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512bw"; } }; #if XSIMD_WITH_AVX512BW #if !XSIMD_WITH_AVX512DQ #error "architecture inconsistency: avx512bw requires avx512dq" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512cd_register.hpp000066400000000000000000000031651517435117100270250ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512CD_REGISTER_HPP #define XSIMD_AVX512CD_REGISTER_HPP #include "./xsimd_avx512f_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512CD instructions */ struct avx512cd : avx512f { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512cd"; } }; #if XSIMD_WITH_AVX512CD #if !XSIMD_WITH_AVX512F #error "architecture inconsistency: avx512bw requires avx512f" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512dq_register.hpp000066400000000000000000000031721517435117100270410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512DQ_REGISTER_HPP #define XSIMD_AVX512DQ_REGISTER_HPP #include "./xsimd_avx512cd_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512DQ instructions */ struct avx512dq : avx512cd { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512dq"; } }; #if XSIMD_WITH_AVX512DQ #if !XSIMD_WITH_AVX512CD #error "architecture inconsistency: avx512dq requires avx512cd" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512er_register.hpp000066400000000000000000000031721517435117100270430ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512ER_REGISTER_HPP #define XSIMD_AVX512ER_REGISTER_HPP #include "./xsimd_avx512dq_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512ER instructions */ struct avx512er : avx512cd { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512ER; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512er"; } }; #if XSIMD_WITH_AVX512ER #if !XSIMD_WITH_AVX512CD #error "architecture inconsistency: avx512er requires avx512cd" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512er, avx512cd); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512f_register.hpp000066400000000000000000000060671517435117100266700ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512F_REGISTER_HPP #define XSIMD_AVX512F_REGISTER_HPP #include "./xsimd_common_arch.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512F instructions */ struct avx512f : common { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; } static constexpr bool available() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 64; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512f"; } }; #if XSIMD_WITH_AVX512F #if !XSIMD_WITH_AVX2 #error "architecture inconsistency: avx512f requires avx2" #endif namespace types { template struct simd_avx512_bool_register { using register_type = std::conditional_t< (sizeof(T) < 4), std::conditional_t<(sizeof(T) == 1), __mmask64, __mmask32>, std::conditional_t<(sizeof(T) == 4), __mmask16, __mmask8>>; register_type data; simd_avx512_bool_register() = default; simd_avx512_bool_register(register_type r) { data = r; } operator register_type() const noexcept { return data; } }; template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512); XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, 
__m512d); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512ifma_register.hpp000066400000000000000000000032161517435117100273500ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512IFMA_REGISTER_HPP #define XSIMD_AVX512IFMA_REGISTER_HPP #include "./xsimd_avx512bw_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512IFMA instructions */ struct avx512ifma : avx512bw { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512IFMA; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512ifma"; } }; #if XSIMD_WITH_AVX512IFMA #if !XSIMD_WITH_AVX512BW #error "architecture inconsistency: avx512ifma requires avx512bw" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512ifma, avx512bw); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512pf_register.hpp000066400000000000000000000031721517435117100270420ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512PF_REGISTER_HPP #define XSIMD_AVX512PF_REGISTER_HPP #include "./xsimd_avx512er_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512PF instructions */ struct avx512pf : avx512er { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512PF; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512pf"; } }; #if XSIMD_WITH_AVX512PF #if !XSIMD_WITH_AVX512ER #error "architecture inconsistency: avx512pf requires avx512er" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512pf, avx512er); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512vbmi2_register.hpp000066400000000000000000000032411517435117100274510ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512VBMI2_REGISTER_HPP #define XSIMD_AVX512VBMI2_REGISTER_HPP #include "./xsimd_avx512vbmi_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512VBMI2 instructions */ struct avx512vbmi2 : avx512vbmi { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI2; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512vbmi2"; } }; #if XSIMD_WITH_AVX512VBMI2 #if !XSIMD_WITH_AVX512VBMI #error "architecture inconsistency: avx512vbmi2 requires avx512vbmi" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vbmi2, avx512vbmi); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512vbmi_register.hpp000066400000000000000000000032301517435117100273650ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512VBMI_REGISTER_HPP #define XSIMD_AVX512VBMI_REGISTER_HPP #include "./xsimd_avx512ifma_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVX512VBMI instructions */ struct avx512vbmi : avx512ifma { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512vbmi"; } }; #if XSIMD_WITH_AVX512VBMI #if !XSIMD_WITH_AVX512IFMA #error "architecture inconsistency: avx512vbmi requires avx512ifma" #endif namespace types { template struct get_bool_simd_register { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vbmi, avx512ifma); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp000066400000000000000000000034461517435117100310520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP #define XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP #include "./xsimd_avx512bw_register.hpp" namespace xsimd { template struct avx512vnni; /** * @ingroup architectures * * AVX512VNNI instructions */ template <> struct avx512vnni : avx512bw { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512BW; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512vnni+avx512bw"; } }; #if XSIMD_WITH_AVX512VNNI_AVX512BW #if !XSIMD_WITH_AVX512BW #error "architecture inconsistency: avx512vnni+avx512bw requires avx512bw" #endif namespace types { template struct get_bool_simd_register> { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512bw); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx512vnni_avx512vbmi2_register.hpp000066400000000000000000000035171517435117100314600ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512VBMI2_REGISTER_HPP #define XSIMD_AVX512VNNI_AVX512VBMI2_REGISTER_HPP #include "./xsimd_avx512vbmi2_register.hpp" namespace xsimd { template struct avx512vnni; /** * @ingroup architectures * * AVX512VNNI instructions */ template <> struct avx512vnni : avx512vbmi2 { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512VBMI2; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avx512vnni+avx512vbmi2"; } }; #if XSIMD_WITH_AVX512VNNI_AVX512VBMI2 #if !XSIMD_WITH_AVX512VBMI2 #error "architecture inconsistency: avx512vnni+avx512vbmi2 requires avx512vbmi2" #endif namespace types { template struct get_bool_simd_register> { using type = simd_avx512_bool_register; }; XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512vbmi2); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avx_register.hpp000066400000000000000000000045431517435117100262670ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX_REGISTER_HPP #define XSIMD_AVX_REGISTER_HPP #include "./xsimd_common_arch.hpp" namespace xsimd { /** * @ingroup architectures * * AVX instructions */ struct avx : common { static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; } static constexpr bool available() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 32; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr char const* name() noexcept { return "avx"; } }; } #if XSIMD_WITH_AVX #if !XSIMD_WITH_SSE4_2 #error "architecture inconsistency: avx requires sse4.2" #endif #include namespace xsimd { namespace types { XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256); XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d); } } #endif #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_avxvnni_register.hpp000066400000000000000000000027011517435117100271540ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVXVNNI_REGISTER_HPP #define XSIMD_AVXVNNI_REGISTER_HPP #include "./xsimd_avx2_register.hpp" namespace xsimd { /** * @ingroup architectures * * AVXVNNI instructions */ struct avxvnni : avx2 { static constexpr bool supported() noexcept { return XSIMD_WITH_AVXVNNI; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "avxvnni"; } }; #if XSIMD_WITH_AVXVNNI #if !XSIMD_WITH_AVX2 #error "architecture inconsistency: avxvnni requires avx2" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avxvnni, avx2); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_batch.hpp000066400000000000000000001700041517435117100246420ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_BATCH_HPP #define XSIMD_BATCH_HPP #include #include #include "../config/xsimd_arch.hpp" #include "../config/xsimd_macros.hpp" #include "../memory/xsimd_alignment.hpp" #include "./xsimd_batch_fwd.hpp" #include "./xsimd_utils.hpp" namespace xsimd { namespace types { template struct integral_only_operators { XSIMD_INLINE batch& operator%=(batch const& other) noexcept; XSIMD_INLINE batch& operator>>=(int32_t other) noexcept; XSIMD_INLINE batch& operator>>=(batch const& other) noexcept; XSIMD_INLINE batch& operator<<=(int32_t other) noexcept; XSIMD_INLINE batch& operator<<=(batch const& other) noexcept; /** Shorthand for xsimd::mod() */ friend XSIMD_INLINE batch operator%(batch const& self, batch const& other) noexcept { return batch(self) %= other; } /** Shorthand for xsimd::bitwise_rshift() */ friend XSIMD_INLINE batch operator>>(batch const& self, batch const& other) noexcept { return batch(self) >>= other; } /** Shorthand for xsimd::bitwise_lshift() */ friend XSIMD_INLINE batch operator<<(batch const& self, batch const& other) noexcept { return batch(self) <<= other; } /** Shorthand for xsimd::bitwise_rshift() */ friend XSIMD_INLINE batch operator>>(batch const& self, int32_t other) noexcept { return batch(self) >>= other; } /** Shorthand for xsimd::bitwise_lshift() */ friend XSIMD_INLINE batch operator<<(batch const& self, int32_t other) noexcept { return batch(self) <<= other; } }; template struct integral_only_operators { }; template struct integral_only_operators { }; } namespace details { // These functions are forward-declared here so that they can be used by friend functions // with batch. Their implementation must appear only once the // kernel implementations have been included. 
template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch_bool le(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other) noexcept; } /** * @brief batch of integer or floating point values. * * Abstract representation of an SIMD register for floating point or integral * value. * * @tparam T the type of the underlying values. * @tparam A the architecture this batch is tied to. **/ template class batch : public types::simd_register, public types::integral_only_operators { static_assert(!std::is_same::value, "use xsimd::batch_bool instead of xsimd::batch"); public: static constexpr std::size_t size = sizeof(types::simd_register) / sizeof(T); ///< Number of scalar elements in this batch. using value_type = T; ///< Type of the scalar elements within this batch. using arch_type = A; ///< SIMD Architecture abstracted by this batch. using register_type = typename types::simd_register::register_type; ///< SIMD register type abstracted by this batch. using batch_bool_type = batch_bool; ///< Associated batch type used to represent logical operations on this batch. // constructors XSIMD_INLINE batch() = default; ///< Create a batch initialized with undefined values. XSIMD_INLINE batch(T val) noexcept; template XSIMD_INLINE batch(T val0, T val1, Ts... 
vals) noexcept; XSIMD_INLINE explicit batch(batch_bool_type const& b) noexcept; XSIMD_INLINE batch(register_type reg) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch broadcast(U val) noexcept; // memory operators template XSIMD_INLINE void store_aligned(U* mem) const noexcept; template XSIMD_INLINE void store_unaligned(U* mem) const noexcept; template XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept; template XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept; template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; // Compile-time mask overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(U const* mem) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(U const* mem) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; // Compile-time mask overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch gather(U const* src, batch const& index) noexcept; template XSIMD_INLINE void scatter(U* dst, batch const& index) const noexcept; XSIMD_INLINE T get(std::size_t i) const noexcept; XSIMD_INLINE T first() const noexcept; // comparison operators. Defined as friend to enable automatic // conversion of parameters from scalar to batch, at the cost of using a // proxy implementation from details::. 
friend XSIMD_INLINE batch_bool operator==(batch const& self, batch const& other) noexcept { return details::eq(self, other); } friend XSIMD_INLINE batch_bool operator!=(batch const& self, batch const& other) noexcept { return details::neq(self, other); } friend XSIMD_INLINE batch_bool operator>=(batch const& self, batch const& other) noexcept { return details::ge(self, other); } friend XSIMD_INLINE batch_bool operator<=(batch const& self, batch const& other) noexcept { return details::le(self, other); } friend XSIMD_INLINE batch_bool operator>(batch const& self, batch const& other) noexcept { return details::gt(self, other); } friend XSIMD_INLINE batch_bool operator<(batch const& self, batch const& other) noexcept { return details::lt(self, other); } // Update operators XSIMD_INLINE batch& operator+=(batch const& other) noexcept; XSIMD_INLINE batch& operator-=(batch const& other) noexcept; XSIMD_INLINE batch& operator*=(batch const& other) noexcept; XSIMD_INLINE batch& operator/=(batch const& other) noexcept; XSIMD_INLINE batch& operator&=(batch const& other) noexcept; XSIMD_INLINE batch& operator|=(batch const& other) noexcept; XSIMD_INLINE batch& operator^=(batch const& other) noexcept; // incr/decr operators XSIMD_INLINE batch& operator++() noexcept; XSIMD_INLINE batch& operator--() noexcept; XSIMD_INLINE batch operator++(int) noexcept; XSIMD_INLINE batch operator--(int) noexcept; // unary operators XSIMD_INLINE batch_bool_type operator!() const noexcept; XSIMD_INLINE batch operator~() const noexcept; XSIMD_INLINE batch operator-() const noexcept; XSIMD_INLINE batch operator+() const noexcept; // arithmetic operators. They are defined as friend to enable automatic // conversion of parameters from scalar to batch. Inline implementation // is required to avoid warnings. 
/** Shorthand for xsimd::add() */ friend XSIMD_INLINE batch operator+(batch const& self, batch const& other) noexcept { return batch(self) += other; } /** Shorthand for xsimd::sub() */ friend XSIMD_INLINE batch operator-(batch const& self, batch const& other) noexcept { return batch(self) -= other; } /** Shorthand for xsimd::mul() */ friend XSIMD_INLINE batch operator*(batch const& self, batch const& other) noexcept { return batch(self) *= other; } /** Shorthand for xsimd::div() */ friend XSIMD_INLINE batch operator/(batch const& self, batch const& other) noexcept { return batch(self) /= other; } /** Shorthand for xsimd::bitwise_and() */ friend XSIMD_INLINE batch operator&(batch const& self, batch const& other) noexcept { return batch(self) &= other; } /** Shorthand for xsimd::bitwise_or() */ friend XSIMD_INLINE batch operator|(batch const& self, batch const& other) noexcept { return batch(self) |= other; } /** Shorthand for xsimd::bitwise_xor() */ friend XSIMD_INLINE batch operator^(batch const& self, batch const& other) noexcept { return batch(self) ^= other; } /** Shorthand for xsimd::logical_and() */ friend XSIMD_INLINE batch operator&&(batch const& self, batch const& other) noexcept { return batch(self).logical_and(other); } /** Shorthand for xsimd::logical_or() */ friend XSIMD_INLINE batch operator||(batch const& self, batch const& other) noexcept { return batch(self).logical_or(other); } private: XSIMD_INLINE batch logical_and(batch const& other) const noexcept; XSIMD_INLINE batch logical_or(batch const& other) const noexcept; }; #if __cplusplus < 201703L template constexpr std::size_t batch::size; #endif /** * @brief batch of predicate over scalar or complex values. * * Abstract representation of a predicate over SIMD register for scalar or * complex values. * * @tparam T the type of the predicated values. * @tparam A the architecture this batch is tied too. 
**/ template class batch_bool : public types::get_bool_simd_register_t { using base_type = types::get_bool_simd_register_t; public: static constexpr std::size_t size = sizeof(types::simd_register) / sizeof(T); ///< Number of scalar elements in this batch. using value_type = bool; ///< Type of the scalar elements within this batch. using operand_type = T; using arch_type = A; ///< SIMD Architecture abstracted by this batch. using register_type = typename base_type::register_type; ///< SIMD register type abstracted by this batch. using batch_type = batch; ///< Associated batch type this batch represents logical operations for. // constructors XSIMD_INLINE batch_bool() = default; ///< Create a batch initialized with undefined values. XSIMD_INLINE batch_bool(bool val) noexcept; XSIMD_INLINE batch_bool(register_type reg) noexcept; template XSIMD_INLINE batch_bool(bool val0, bool val1, Ts... vals) noexcept; template XSIMD_INLINE batch_bool(Tp const*) = delete; // memory operators XSIMD_INLINE void store_aligned(bool* mem) const noexcept; XSIMD_INLINE void store_unaligned(bool* mem) const noexcept; XSIMD_INLINE void store_stream(bool* mem) const noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_aligned(bool const* mem) noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_unaligned(bool const* mem) noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch_bool load_stream(bool const* mem) noexcept; XSIMD_INLINE bool get(std::size_t i) const noexcept; XSIMD_INLINE bool first() const noexcept; // mask operations XSIMD_INLINE uint64_t mask() const noexcept; XSIMD_INLINE static batch_bool from_mask(uint64_t mask) noexcept; // comparison operators XSIMD_INLINE batch_bool operator==(batch_bool const& other) const noexcept; XSIMD_INLINE batch_bool operator!=(batch_bool const& other) const noexcept; // logical operators XSIMD_INLINE batch_bool operator~() const noexcept; XSIMD_INLINE batch_bool operator!() const noexcept; XSIMD_INLINE batch_bool 
operator&(batch_bool const& other) const noexcept; XSIMD_INLINE batch_bool operator|(batch_bool const& other) const noexcept; XSIMD_INLINE batch_bool operator^(batch_bool const& other) const noexcept; XSIMD_INLINE batch_bool operator&&(batch_bool const& other) const noexcept; XSIMD_INLINE batch_bool operator||(batch_bool const& other) const noexcept; // update operators XSIMD_INLINE batch_bool& operator&=(batch_bool const& other) noexcept { return (*this) = (*this) & other; } XSIMD_INLINE batch_bool& operator|=(batch_bool const& other) noexcept { return (*this) = (*this) | other; } XSIMD_INLINE batch_bool& operator^=(batch_bool const& other) noexcept { return (*this) = (*this) ^ other; } private: template static XSIMD_INLINE register_type make_register(std::index_sequence, U u, V... v) noexcept; template static XSIMD_INLINE register_type make_register(std::index_sequence<>, V... v) noexcept; }; #if __cplusplus < 201703L template constexpr std::size_t batch_bool::size; #endif /** * @brief batch of complex values. * * Abstract representation of an SIMD register for complex values. * * @tparam T the type of the underlying values. * @tparam A the architecture this batch is tied too. **/ template class batch, A> { public: using value_type = std::complex; ///< Type of the complex elements within this batch. using real_batch = batch; ///< Type of the scalar elements within this batch. using arch_type = A; ///< SIMD Architecture abstracted by this batch. using batch_bool_type = batch_bool; ///< Associated batch type used to represented logical operations on this batch. static constexpr std::size_t size = real_batch::size; ///< Number of complex elements in this batch. // constructors XSIMD_INLINE batch() = default; ///< Create a batch initialized with undefined values. 
XSIMD_INLINE batch(value_type const& val) noexcept; XSIMD_INLINE batch(real_batch const& real, real_batch const& imag) noexcept; XSIMD_INLINE batch(real_batch const& real) noexcept; XSIMD_INLINE batch(T val) noexcept; template XSIMD_INLINE batch(value_type val0, value_type val1, Ts... vals) noexcept; XSIMD_INLINE explicit batch(batch_bool_type const& b) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch broadcast(U val) noexcept; // memory operators XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(const T* real_src, const T* imag_src = nullptr) noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(const T* real_src, const T* imag_src = nullptr) noexcept; XSIMD_INLINE void store_aligned(T* real_dst, T* imag_dst) const noexcept; XSIMD_INLINE void store_unaligned(T* real_dst, T* imag_dst) const noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(const value_type* src) noexcept; XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(const value_type* src) noexcept; XSIMD_INLINE void store_aligned(value_type* dst) const noexcept; XSIMD_INLINE void store_unaligned(value_type* dst) const noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, aligned_mode) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, unaligned_mode) noexcept; // Compile-time mask overloads template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, batch_bool_constant mask, Mode = {}) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load(U const* mem, stream_mode) noexcept; template XSIMD_INLINE void store(U* mem, aligned_mode) const noexcept; template XSIMD_INLINE void store(U* mem, unaligned_mode) const noexcept; // Compile-time mask overloads template XSIMD_INLINE void store(U* mem, batch_bool_constant mask, Mode = {}) const noexcept; template XSIMD_INLINE void store(U* mem, stream_mode) const noexcept; XSIMD_INLINE real_batch real() const noexcept; XSIMD_INLINE 
real_batch imag() const noexcept; XSIMD_INLINE value_type get(std::size_t i) const noexcept; XSIMD_INLINE value_type first() const noexcept; #ifdef XSIMD_ENABLE_XTL_COMPLEX // xtl-related methods template XSIMD_INLINE batch(xtl::xcomplex const& val) noexcept; template XSIMD_INLINE batch(xtl::xcomplex val0, xtl::xcomplex val1, Ts... vals) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_aligned(const xtl::xcomplex* src) noexcept; template XSIMD_NO_DISCARD static XSIMD_INLINE batch load_unaligned(const xtl::xcomplex* src) noexcept; template XSIMD_INLINE void store_aligned(xtl::xcomplex* dst) const noexcept; template XSIMD_INLINE void store_unaligned(xtl::xcomplex* dst) const noexcept; #endif // comparison operators XSIMD_INLINE batch_bool operator==(batch const& other) const noexcept; XSIMD_INLINE batch_bool operator!=(batch const& other) const noexcept; // Update operators XSIMD_INLINE batch& operator+=(batch const& other) noexcept; XSIMD_INLINE batch& operator-=(batch const& other) noexcept; XSIMD_INLINE batch& operator*=(batch const& other) noexcept; XSIMD_INLINE batch& operator/=(batch const& other) noexcept; // incr/decr operators XSIMD_INLINE batch& operator++() noexcept; XSIMD_INLINE batch& operator--() noexcept; XSIMD_INLINE batch operator++(int) noexcept; XSIMD_INLINE batch operator--(int) noexcept; // unary operators XSIMD_INLINE batch_bool_type operator!() const noexcept; XSIMD_INLINE batch operator~() const noexcept; XSIMD_INLINE batch operator-() const noexcept; XSIMD_INLINE batch operator+() const noexcept; // arithmetic operators. 
They are defined as friend to enable automatic // conversion of parameters from scalar to batch /** Shorthand for xsimd::add() */ friend XSIMD_INLINE batch operator+(batch const& self, batch const& other) noexcept { return batch(self) += other; } /** Shorthand for xsimd::sub() */ friend XSIMD_INLINE batch operator-(batch const& self, batch const& other) noexcept { return batch(self) -= other; } /** Shorthand for xsimd::mul() */ friend XSIMD_INLINE batch operator*(batch const& self, batch const& other) noexcept { return batch(self) *= other; } /** Shorthand for xsimd::div() */ friend XSIMD_INLINE batch operator/(batch const& self, batch const& other) noexcept { return batch(self) /= other; } private: real_batch m_real; real_batch m_imag; }; #if __cplusplus < 201703L template constexpr std::size_t batch, A>::size; #endif #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct batch, A> { static_assert(std::is_same::value, "Please use batch, A> initialized from xtl::xcomplex instead"); }; #endif } #include "../arch/xsimd_isa.hpp" #include "./xsimd_batch_constant.hpp" #include "./xsimd_traits.hpp" namespace xsimd { /** * Create a batch with all element initialized to \c val. */ template XSIMD_INLINE batch::batch(T val) noexcept : types::simd_register(kernel::broadcast(val, A {})) { detail::static_check_supported_config(); } /** * Create a batch with elements initialized from \c val0, \c val1, \c vals... * There must be exactly \c size elements in total. */ template template XSIMD_INLINE batch::batch(T val0, T val1, Ts... vals) noexcept : batch(kernel::set(batch {}, A {}, val0, val1, static_cast(vals)...)) { detail::static_check_supported_config(); static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements."); } /** * Converts a \c bool_batch to a \c batch where each element is * set to 1 (resp. 0) if the corresponding element is `true` * (resp. `false`). 
*/ template XSIMD_INLINE batch::batch(batch_bool const& b) noexcept : batch(kernel::from_bool(b, A {})) { } /** * Wraps a compatible native simd register as a \c batch. This is generally not needed but * becomes handy when doing architecture-specific operations. */ template XSIMD_INLINE batch::batch(register_type reg) noexcept : types::simd_register({ reg }) { detail::static_check_supported_config(); } /** * Equivalent to batch::batch(T val). */ template template XSIMD_NO_DISCARD XSIMD_INLINE batch batch::broadcast(U val) noexcept { detail::static_check_supported_config(); return batch(static_cast(val)); } /************************** * batch memory operators * **************************/ /** * Copy content of this batch to the buffer \c mem. The * memory needs to be aligned. */ template template XSIMD_INLINE void batch::store_aligned(U* mem) const noexcept { detail::static_check_supported_config(); assert(((reinterpret_cast(mem) % A::alignment()) == 0) && "store location is not properly aligned"); kernel::store_aligned(mem, *this, A {}); } /** * Copy content of this batch to the buffer \c mem. The * memory does not need to be aligned. 
*/ template template XSIMD_INLINE void batch::store_unaligned(U* mem) const noexcept { detail::static_check_supported_config(); kernel::store_unaligned(mem, *this, A {}); } /** * Equivalent to batch::store_aligned() */ template template XSIMD_INLINE void batch::store(U* mem, aligned_mode) const noexcept { detail::static_check_supported_config(); return store_aligned(mem); } /** * Equivalent to batch::store_unaligned() */ template template XSIMD_INLINE void batch::store(U* mem, unaligned_mode) const noexcept { detail::static_check_supported_config(); return store_unaligned(mem); } // masked store free functions are provided in xsimd_api.hpp template template XSIMD_INLINE void batch::store(U* mem, stream_mode) const noexcept { detail::static_check_supported_config(); assert(((reinterpret_cast(mem) % A::alignment()) == 0) && "store location is not properly aligned"); kernel::store_stream(mem, *this, A {}); } /** * Loading from aligned memory. May involve a conversion if \c U is different * from \c T. */ template template XSIMD_INLINE batch batch::load_aligned(U const* mem) noexcept { assert(((reinterpret_cast(mem) % A::alignment()) == 0) && "loaded pointer is not properly aligned"); detail::static_check_supported_config(); return kernel::load_aligned(mem, kernel::convert {}, A {}); } /** * Loading from unaligned memory. May involve a conversion if \c U is different * from \c T. 
*/ template template XSIMD_INLINE batch batch::load_unaligned(U const* mem) noexcept { detail::static_check_supported_config(); return kernel::load_unaligned(mem, kernel::convert {}, A {}); } /** * Equivalent to batch::load_aligned() */ template template XSIMD_INLINE batch batch::load(U const* mem, aligned_mode) noexcept { detail::static_check_supported_config(); return load_aligned(mem); } /** * Equivalent to batch::load_unaligned() */ template template XSIMD_INLINE batch batch::load(U const* mem, unaligned_mode) noexcept { detail::static_check_supported_config(); return load_unaligned(mem); } template template XSIMD_INLINE batch batch::load(U const* mem, batch_bool_constant mask, Mode mode) noexcept { detail::static_check_supported_config(); static_assert(std::is_same::value || std::is_same::value, "supported load mode"); XSIMD_IF_CONSTEXPR(mask.all()) { return load(mem, mode); } else XSIMD_IF_CONSTEXPR(mask.none()) { return broadcast(0); } else { return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); } } template template XSIMD_INLINE void batch::store(U* mem, batch_bool_constant mask, Mode mode) const noexcept { detail::static_check_supported_config(); static_assert(std::is_same::value || std::is_same::value, "supported store mode"); XSIMD_IF_CONSTEXPR(mask.none()) { return; } else XSIMD_IF_CONSTEXPR(mask.all()) { store(mem, mode); } else { kernel::store_masked(mem, *this, mask, mode, A {}); } } template template XSIMD_INLINE batch batch::load(U const* mem, stream_mode) noexcept { detail::static_check_supported_config(); assert(((reinterpret_cast(mem) % A::alignment()) == 0) && "loaded pointer is not properly aligned"); return kernel::load_stream(mem, kernel::convert {}, A {}); } /** * Create a new batch gathering elements starting at address \c src and * offset by each element in \c index. * If \c T is not of the same size as \c U, a \c static_cast is performed * at element gather time. 
*/ template template XSIMD_INLINE batch batch::gather(U const* src, batch const& index) noexcept { detail::static_check_supported_config(); static_assert(std::is_convertible::value, "Can't convert from src to this batch's type!"); return kernel::gather(batch {}, src, index, A {}); } /** * Scatter elements from this batch into addresses starting at \c dst * and offset by each element in \c index. * If \c T is not of the same size as \c U, a \c static_cast is performed * at element scatter time. */ template template XSIMD_INLINE void batch::scatter(U* dst, batch const& index) const noexcept { detail::static_check_supported_config(); static_assert(std::is_convertible::value, "Can't convert from this batch's type to dst!"); kernel::scatter(*this, dst, index, A {}); } /** * Retrieve the \c i th scalar element in this batch. * * \warning This is very inefficient and should only be used for debugging purposes. */ template XSIMD_INLINE T batch::get(std::size_t i) const noexcept { return kernel::get(*this, i, A {}); } /** * Retrieve the first scalar element in this batch.
*/ template XSIMD_INLINE T batch::first() const noexcept { detail::static_check_supported_config(); return kernel::first(*this, A {}); } /****************************** * batch comparison operators * ******************************/ namespace details { /** * Shorthand for xsimd::eq() */ template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other) noexcept { detail::static_check_supported_config(); return kernel::eq(self, other, A {}); } /** * Shorthand for xsimd::neq() */ template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other) noexcept { detail::static_check_supported_config(); return kernel::neq(self, other, A {}); } /** * Shorthand for xsimd::ge() */ template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other) noexcept { detail::static_check_supported_config(); return kernel::ge(self, other, A {}); } /** * Shorthand for xsimd::le() */ template XSIMD_INLINE batch_bool le(batch const& self, batch const& other) noexcept { detail::static_check_supported_config(); return kernel::le(self, other, A {}); } /** * Shorthand for xsimd::gt() */ template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other) noexcept { detail::static_check_supported_config(); return kernel::gt(self, other, A {}); } /** * Shorthand for xsimd::lt() */ template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other) noexcept { detail::static_check_supported_config(); return kernel::lt(self, other, A {}); } } /************************** * batch update operators * **************************/ template XSIMD_INLINE batch& batch::operator+=(batch const& other) noexcept { detail::static_check_supported_config(); return *this = kernel::add(*this, other, A {}); } template XSIMD_INLINE batch& batch::operator-=(batch const& other) noexcept { detail::static_check_supported_config(); return *this = kernel::sub(*this, other, A {}); } template XSIMD_INLINE batch& batch::operator*=(batch const& other) noexcept { 
detail::static_check_supported_config(); return *this = kernel::mul(*this, other, A {}); } template XSIMD_INLINE batch& batch::operator/=(batch const& other) noexcept { detail::static_check_supported_config(); return *this = kernel::div(*this, other, A {}); } template XSIMD_INLINE batch& types::integral_only_operators::operator%=(batch const& other) noexcept { ::xsimd::detail::static_check_supported_config(); return *static_cast*>(this) = kernel::mod(*static_cast*>(this), other, A {}); } template XSIMD_INLINE batch& batch::operator&=(batch const& other) noexcept { detail::static_check_supported_config(); return *this = kernel::bitwise_and(*this, other, A {}); } template XSIMD_INLINE batch& batch::operator|=(batch const& other) noexcept { detail::static_check_supported_config(); return *this = kernel::bitwise_or(*this, other, A {}); } template XSIMD_INLINE batch& batch::operator^=(batch const& other) noexcept { detail::static_check_supported_config(); return *this = kernel::bitwise_xor(*this, other, A {}); } template XSIMD_INLINE batch& kernel::integral_only_operators::operator>>=(batch const& other) noexcept { ::xsimd::detail::static_check_supported_config(); return *static_cast*>(this) = kernel::bitwise_rshift(*static_cast*>(this), other, A {}); } template XSIMD_INLINE batch& kernel::integral_only_operators::operator<<=(batch const& other) noexcept { ::xsimd::detail::static_check_supported_config(); return *static_cast*>(this) = kernel::bitwise_lshift(*static_cast*>(this), other, A {}); } template XSIMD_INLINE batch& kernel::integral_only_operators::operator>>=(int32_t other) noexcept { ::xsimd::detail::static_check_supported_config(); return *static_cast*>(this) = kernel::bitwise_rshift(*static_cast*>(this), other, A {}); } template XSIMD_INLINE batch& kernel::integral_only_operators::operator<<=(int32_t other) noexcept { ::xsimd::detail::static_check_supported_config(); return *static_cast*>(this) = kernel::bitwise_lshift(*static_cast*>(this), other, A {}); } 
/***************************** * batch incr/decr operators * *****************************/ template XSIMD_INLINE batch& batch::operator++() noexcept { detail::static_check_supported_config(); return operator+=(1); } template XSIMD_INLINE batch& batch::operator--() noexcept { detail::static_check_supported_config(); return operator-=(1); } template XSIMD_INLINE batch batch::operator++(int) noexcept { detail::static_check_supported_config(); batch copy(*this); operator+=(1); return copy; } template XSIMD_INLINE batch batch::operator--(int) noexcept { detail::static_check_supported_config(); batch copy(*this); operator-=(1); return copy; } /************************* * batch unary operators * *************************/ template XSIMD_INLINE batch_bool batch::operator!() const noexcept { detail::static_check_supported_config(); return kernel::eq(*this, batch(0), A {}); } template XSIMD_INLINE batch batch::operator~() const noexcept { detail::static_check_supported_config(); return kernel::bitwise_not(*this, A {}); } template XSIMD_INLINE batch batch::operator-() const noexcept { detail::static_check_supported_config(); return kernel::neg(*this, A {}); } template XSIMD_INLINE batch batch::operator+() const noexcept { detail::static_check_supported_config(); return *this; } /************************ * batch private method * ************************/ template XSIMD_INLINE batch batch::logical_and(batch const& other) const noexcept { return kernel::logical_and(*this, other, A()); } template XSIMD_INLINE batch batch::logical_or(batch const& other) const noexcept { return kernel::logical_or(*this, other, A()); } /*************************** * batch_bool constructors * ***************************/ template XSIMD_INLINE batch_bool::batch_bool(register_type reg) noexcept : types::get_bool_simd_register_t({ reg }) { } template template XSIMD_INLINE batch_bool::batch_bool(bool val0, bool val1, Ts... 
vals) noexcept : batch_bool(kernel::set(batch_bool {}, A {}, val0, val1, static_cast(vals)...)) { static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements."); } /******************************* * batch_bool memory operators * *******************************/ template XSIMD_INLINE void batch_bool::store_aligned(bool* mem) const noexcept { kernel::store(*this, mem, A {}); } template XSIMD_INLINE void batch_bool::store_unaligned(bool* mem) const noexcept { store_aligned(mem); } template XSIMD_INLINE void batch_bool::store_stream(bool* mem) const noexcept { kernel::store_stream(*this, mem, A {}); } template XSIMD_INLINE batch_bool batch_bool::load_aligned(bool const* mem) noexcept { return kernel::load_aligned(mem, batch_bool(), A {}); } template XSIMD_INLINE batch_bool batch_bool::load_unaligned(bool const* mem) noexcept { return kernel::load_unaligned(mem, batch_bool(), A {}); } template XSIMD_INLINE batch_bool batch_bool::load_stream(bool const* mem) noexcept { return kernel::load_stream(mem, batch_bool(), A {}); } /** * Extract a scalar mask representation from this @c batch_bool. * * @return bit mask */ template XSIMD_INLINE uint64_t batch_bool::mask() const noexcept { return kernel::mask(*this, A {}); } /** * Create a @c batch_bool from a scalar mask representation.
* * @param mask bit mask * @return the @c batch_bool built by kernel::from_mask from that mask */ template XSIMD_INLINE batch_bool batch_bool::from_mask(uint64_t mask) noexcept { return kernel::from_mask(batch_bool(), mask, A {}); } template XSIMD_INLINE bool batch_bool::get(std::size_t i) const noexcept { return kernel::get(*this, i, A {}); } template XSIMD_INLINE bool batch_bool::first() const noexcept { detail::static_check_supported_config(); return kernel::first(*this, A {}); } /*********************************** * batch_bool comparison operators * ***********************************/ template XSIMD_INLINE batch_bool batch_bool::operator==(batch_bool const& other) const noexcept { return kernel::eq(*this, other, A {}).data; } template XSIMD_INLINE batch_bool batch_bool::operator!=(batch_bool const& other) const noexcept { return kernel::neq(*this, other, A {}).data; } /******************************** * batch_bool logical operators * ********************************/ template XSIMD_INLINE batch_bool batch_bool::operator~() const noexcept { return kernel::bitwise_not(*this, A {}).data; } template XSIMD_INLINE batch_bool batch_bool::operator!() const noexcept { return operator==(batch_bool(false)); } template XSIMD_INLINE batch_bool batch_bool::operator&(batch_bool const& other) const noexcept { return kernel::bitwise_and(*this, other, A {}).data; } template XSIMD_INLINE batch_bool batch_bool::operator|(batch_bool const& other) const noexcept { return kernel::bitwise_or(*this, other, A {}).data; } template XSIMD_INLINE batch_bool batch_bool::operator^(batch_bool const& other) const noexcept { return kernel::bitwise_xor(*this, other, A {}).data; } template XSIMD_INLINE batch_bool batch_bool::operator&&(batch_bool const& other) const noexcept { return operator&(other); } template XSIMD_INLINE batch_bool batch_bool::operator||(batch_bool const& other) const noexcept { return operator|(other); } /****************************** * batch_bool private methods * ******************************/ template XSIMD_INLINE batch_bool::batch_bool(bool
val) noexcept : base_type { make_register(std::make_index_sequence(), val) } { } template template XSIMD_INLINE auto batch_bool::make_register(std::index_sequence, U u, V... v) noexcept -> register_type { return make_register(std::index_sequence(), u, u, v...); } template template XSIMD_INLINE auto batch_bool::make_register(std::index_sequence<>, V... v) noexcept -> register_type { return kernel::set(batch_bool(), A {}, v...).data; } /******************************* * batch constructors * *******************************/ template XSIMD_INLINE batch, A>::batch(value_type const& val) noexcept : m_real(val.real()) , m_imag(val.imag()) { } template XSIMD_INLINE batch, A>::batch(real_batch const& real, real_batch const& imag) noexcept : m_real(real) , m_imag(imag) { } template XSIMD_INLINE batch, A>::batch(real_batch const& real) noexcept : m_real(real) , m_imag(0) { } template XSIMD_INLINE batch, A>::batch(T val) noexcept : m_real(val) , m_imag(0) { } template template XSIMD_INLINE batch, A>::batch(value_type val0, value_type val1, Ts... vals) noexcept : batch(kernel::set(batch {}, A {}, val0, val1, static_cast(vals)...)) { static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements"); } template XSIMD_INLINE batch, A>::batch(batch_bool_type const& b) noexcept : m_real(b) , m_imag(0) { } template template XSIMD_NO_DISCARD XSIMD_INLINE batch, A> batch, A>::broadcast(U val) noexcept { return batch(static_cast>(val)); } /*********************************** * batch memory operators * ***********************************/ template XSIMD_INLINE batch, A> batch, A>::load_aligned(const T* real_src, const T* imag_src) noexcept { return { batch::load_aligned(real_src), imag_src ? batch::load_aligned(imag_src) : batch(0) }; } template XSIMD_INLINE batch, A> batch, A>::load_unaligned(const T* real_src, const T* imag_src) noexcept { return { batch::load_unaligned(real_src), imag_src ? 
batch::load_unaligned(imag_src) : batch(0) }; } template XSIMD_INLINE batch, A> batch, A>::load_aligned(const value_type* src) noexcept { assert(((reinterpret_cast(src) % A::alignment()) == 0) && "loaded pointer is not properly aligned"); return kernel::load_complex_aligned(src, kernel::convert {}, A {}); } template XSIMD_INLINE batch, A> batch, A>::load_unaligned(const value_type* src) noexcept { return kernel::load_complex_unaligned(src, kernel::convert {}, A {}); } template XSIMD_INLINE void batch, A>::store_aligned(value_type* dst) const noexcept { assert(((reinterpret_cast(dst) % A::alignment()) == 0) && "store location is not properly aligned"); return kernel::store_complex_aligned(dst, *this, A {}); } template XSIMD_INLINE void batch, A>::store_unaligned(value_type* dst) const noexcept { return kernel::store_complex_unaligned(dst, *this, A {}); } // Compile-time mask overloads for complex store template template XSIMD_INLINE void batch, A>::store(U* mem, batch_bool_constant mask, Mode mode) const noexcept { kernel::store_masked(mem, *this, mask, mode, A {}); } template XSIMD_INLINE void batch, A>::store_aligned(T* real_dst, T* imag_dst) const noexcept { m_real.store_aligned(real_dst); m_imag.store_aligned(imag_dst); } template XSIMD_INLINE void batch, A>::store_unaligned(T* real_dst, T* imag_dst) const noexcept { m_real.store_unaligned(real_dst); m_imag.store_unaligned(imag_dst); } template template XSIMD_INLINE batch, A> batch, A>::load(U const* mem, aligned_mode) noexcept { return load_aligned(mem); } template template XSIMD_INLINE batch, A> batch, A>::load(U const* mem, unaligned_mode) noexcept { return load_unaligned(mem); } // Compile-time mask overloads for complex load template template XSIMD_INLINE batch, A> batch, A>::load(U const* mem, batch_bool_constant mask, Mode mode) noexcept { return kernel::load_masked(mem, mask, kernel::convert {}, mode, A {}); } template template XSIMD_INLINE batch, A> batch, A>::load(U const* mem, stream_mode) noexcept { 
assert(((reinterpret_cast(mem) % A::alignment()) == 0) && "loaded pointer is not properly aligned"); auto* ptr = reinterpret_cast(mem); return kernel::load_complex_stream(ptr, kernel::convert {}, A {}); } template template XSIMD_INLINE void batch, A>::store(U* mem, aligned_mode) const noexcept { return store_aligned(mem); } template template XSIMD_INLINE void batch, A>::store(U* mem, unaligned_mode) const noexcept { return store_unaligned(mem); } template template XSIMD_INLINE void batch, A>::store(U* mem, stream_mode) const noexcept { assert(((reinterpret_cast(mem) % A::alignment()) == 0) && "store location is not properly aligned"); auto* ptr = reinterpret_cast(mem); return kernel::store_complex_stream(ptr, *this, A {}); } template XSIMD_INLINE auto batch, A>::real() const noexcept -> real_batch { return m_real; } template XSIMD_INLINE auto batch, A>::imag() const noexcept -> real_batch { return m_imag; } template XSIMD_INLINE auto batch, A>::get(std::size_t i) const noexcept -> value_type { return kernel::get(*this, i, A {}); } template XSIMD_INLINE auto batch, A>::first() const noexcept -> value_type { detail::static_check_supported_config, A>(); return kernel::first(*this, A {}); } /************************************** * batch xtl-related methods * **************************************/ #ifdef XSIMD_ENABLE_XTL_COMPLEX template template XSIMD_INLINE batch, A>::batch(xtl::xcomplex const& val) noexcept : m_real(val.real()) , m_imag(val.imag()) { } template template XSIMD_INLINE batch, A>::batch(xtl::xcomplex val0, xtl::xcomplex val1, Ts... vals) noexcept : batch(kernel::set(batch {}, A {}, val0, val1, static_cast>(vals)...)) { static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements"); } // Memory layout of an xcomplex and std::complex are the same when xcomplex // stores values and not reference. Unfortunately, this breaks strict // aliasing... 
template template XSIMD_INLINE batch, A> batch, A>::load_aligned(const xtl::xcomplex* src) noexcept { return load_aligned(reinterpret_cast const*>(src)); } template template XSIMD_INLINE batch, A> batch, A>::load_unaligned(const xtl::xcomplex* src) noexcept { return load_unaligned(reinterpret_cast const*>(src)); } template template XSIMD_INLINE void batch, A>::store_aligned(xtl::xcomplex* dst) const noexcept { store_aligned(reinterpret_cast*>(dst)); } template template XSIMD_INLINE void batch, A>::store_unaligned(xtl::xcomplex* dst) const noexcept { store_unaligned(reinterpret_cast*>(dst)); } #endif /*************************************** * batch comparison operators * ***************************************/ template XSIMD_INLINE batch_bool batch, A>::operator==(batch const& other) const noexcept { return m_real == other.m_real && m_imag == other.m_imag; } template XSIMD_INLINE batch_bool batch, A>::operator!=(batch const& other) const noexcept { return m_real != other.m_real || m_imag != other.m_imag; } /*********************************** * batch update operators * ***********************************/ template XSIMD_INLINE batch, A>& batch, A>::operator+=(batch const& other) noexcept { m_real += other.m_real; m_imag += other.m_imag; return *this; } template XSIMD_INLINE batch, A>& batch, A>::operator-=(batch const& other) noexcept { m_real -= other.m_real; m_imag -= other.m_imag; return *this; } template XSIMD_INLINE batch, A>& batch, A>::operator*=(batch const& other) noexcept { real_batch new_real = fms(real(), other.real(), imag() * other.imag()); real_batch new_imag = fma(real(), other.imag(), imag() * other.real()); m_real = new_real; m_imag = new_imag; return *this; } template XSIMD_INLINE batch, A>& batch, A>::operator/=(batch const& other) noexcept { real_batch a = real(); real_batch b = imag(); real_batch c = other.real(); real_batch d = other.imag(); real_batch e = c * c + d * d; m_real = (c * a + d * b) / e; m_imag = (c * b - d * a) / e; return 
*this; } /************************************** * batch incr/decr operators * **************************************/ template XSIMD_INLINE batch, A>& batch, A>::operator++() noexcept { return operator+=(1); } template XSIMD_INLINE batch, A>& batch, A>::operator--() noexcept { return operator-=(1); } template XSIMD_INLINE batch, A> batch, A>::operator++(int) noexcept { batch copy(*this); operator+=(1); return copy; } template XSIMD_INLINE batch, A> batch, A>::operator--(int) noexcept { batch copy(*this); operator-=(1); return copy; } /********************************** * batch unary operators * **********************************/ template XSIMD_INLINE batch_bool batch, A>::operator!() const noexcept { return operator==(batch(0)); } template XSIMD_INLINE batch, A> batch, A>::operator~() const noexcept { return { ~m_real, ~m_imag }; } template XSIMD_INLINE batch, A> batch, A>::operator-() const noexcept { return { -m_real, -m_imag }; } template XSIMD_INLINE batch, A> batch, A>::operator+() const noexcept { return { +m_real, +m_imag }; } /********************************** * size type aliases **********************************/ namespace details { template struct sized_batch; template struct sized_batch> { using type = void; }; template ::value> struct batch_trait; template struct batch_trait { using type = xsimd::batch; static constexpr std::size_t size = xsimd::batch::size; }; template struct batch_trait { using type = void; static constexpr std::size_t size = 0; }; template struct sized_batch> { using type = std::conditional_t< batch_trait::size == N, typename batch_trait::type, typename sized_batch>::type>; }; } /** * @brief type utility to select a batch of given type and size * * If one of the available architectures has a native vector type of the * given type and size, sets the @p type member to the appropriate batch * type. Otherwise sets it to @p void. * * @tparam T the type of the underlying values.
* @tparam N the number of elements of that type in the batch. **/ template struct make_sized_batch { using type = typename details::sized_batch::type; }; template using make_sized_batch_t = typename make_sized_batch::type; } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_batch_constant.hpp000066400000000000000000000545671517435117100265720ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_BATCH_CONSTANT_HPP #define XSIMD_BATCH_CONSTANT_HPP #include #include #include #include "./xsimd_batch.hpp" #include "./xsimd_utils.hpp" namespace xsimd { /** * @brief batch of boolean constant * * Abstract representation of a batch of boolean constants. * * @tparam batch_type the type of the associated batch values. * @tparam Values boolean constant represented by this batch **/ template struct batch_bool_constant { using batch_type = batch_bool; static constexpr std::size_t size = sizeof...(Values); using value_type = bool; using operand_type = T; static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); public: /** * @brief Generate a batch of @p batch_type from this @p batch_bool_constant */ constexpr batch_type as_batch_bool() const noexcept { return { Values... }; } /** * @brief Generate a batch of @p integers from this @p batch_bool_constant */ constexpr batch, A> as_batch() const noexcept { return { -as_integer_t(Values)... }; } // the minus is important! 
/** * @brief Generate a batch of @p batch_type from this @p batch_bool_constant */ constexpr operator batch_type() const noexcept { return as_batch_bool(); } constexpr bool get(std::size_t i) const noexcept { return std::array { { Values... } }[i]; } static constexpr int mask() noexcept { return mask_helper(0, static_cast(Values)...); } static constexpr bool none() noexcept { return truncated_mask() == 0u; } static constexpr bool any() noexcept { return !none(); } static constexpr bool all() noexcept { return truncated_mask() == low_mask(size); } static constexpr std::size_t countr_zero() noexcept { return countr_zero_impl(truncated_mask(), size); } static constexpr std::size_t countl_zero() noexcept { return countl_zero_impl(truncated_mask(), size); } static constexpr std::size_t countr_one() noexcept { return countr_one_impl(truncated_mask(), size); } static constexpr std::size_t countl_one() noexcept { return countl_one_impl(truncated_mask(), size); } private: static constexpr int mask_helper(int acc) noexcept { return acc; } template static constexpr int mask_helper(int acc, int mask, Tys... 
masks) noexcept { return mask_helper(acc | mask, (masks << 1)...); } struct logical_or { constexpr bool operator()(bool x, bool y) const { return x || y; } }; struct logical_and { constexpr bool operator()(bool x, bool y) const { return x && y; } }; struct logical_xor { constexpr bool operator()(bool x, bool y) const { return x ^ y; } }; template static constexpr batch_bool_constant::type::value, std::tuple_element::type::value)...> apply(std::index_sequence) { return {}; } template static constexpr auto apply(batch_bool_constant, batch_bool_constant) { static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches"); return apply...>, std::tuple...>>(std::make_index_sequence()); } public: #define MAKE_BINARY_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_bool_constant other) const \ { \ return apply(*this, other); \ } MAKE_BINARY_OP(|, logical_or) MAKE_BINARY_OP(||, logical_or) MAKE_BINARY_OP(&, logical_and) MAKE_BINARY_OP(&&, logical_and) MAKE_BINARY_OP(^, logical_xor) #undef MAKE_BINARY_OP constexpr batch_bool_constant operator!() const { return {}; } constexpr batch_bool_constant operator~() const { return {}; } private: // Build a 64-bit mask from Values... (LSB = index 0) template struct build_bits_helper; template struct build_bits_helper { static constexpr uint64_t value = 0u; }; template struct build_bits_helper { static constexpr uint64_t value = (Current ? (uint64_t(1) << I) : 0u) | build_bits_helper::value; }; static constexpr uint64_t bits() noexcept { return build_bits_helper<0, Values...>::value; } static constexpr uint64_t low_mask(std::size_t k) noexcept { return (k >= 64u) ? ~uint64_t(0) : ((uint64_t(1) << k) - 1u); } static constexpr uint64_t truncated_mask() noexcept { return bits() & low_mask(size); } static constexpr std::size_t countr_zero_impl(uint64_t v, std::size_t n) noexcept { return (n == 0 || (v & 1u) != 0u) ? 
0u : (1u + countr_zero_impl(v >> 1, n - 1)); } static constexpr std::size_t countr_one_impl(uint64_t v, std::size_t n) noexcept { return (n == 0 || (v & 1u) == 0u) ? 0u : (1u + countr_one_impl(v >> 1, n - 1)); } static constexpr std::size_t countl_zero_impl(uint64_t v, std::size_t n) noexcept { return (n == 0) ? 0u : ((((v >> (n - 1)) & 1u) != 0u) ? 0u : (1u + countl_zero_impl(v, n - 1))); } static constexpr std::size_t countl_one_impl(uint64_t v, std::size_t n) noexcept { return (n == 0) ? 0u : ((((v >> (n - 1)) & 1u) == 0u) ? 0u : (1u + countl_one_impl(v, n - 1))); } }; namespace detail { template XSIMD_INLINE constexpr batch_bool_constant< T, A2, std::tuple_element...>>::type::value...> splice_impl(std::index_sequence) noexcept { return {}; } template = Begin ? (End - Begin) : 0)> XSIMD_INLINE constexpr auto splice(batch_bool_constant) noexcept { static_assert(Begin <= End, "splice: Begin must be <= End"); static_assert(End <= sizeof...(Values), "splice: End must be <= size"); static_assert(N == batch_bool::size, "splice: target arch size must match submask length"); return splice_impl(std::make_index_sequence()); } template XSIMD_INLINE constexpr auto lower_half(batch_bool_constant) noexcept { static_assert(sizeof...(Values) % 2 == 0, "lower_half requires even size"); static_assert(batch_bool::size == sizeof...(Values) / 2, "lower_half: target arch size must match submask length"); return splice_impl(std::make_index_sequence()); } template XSIMD_INLINE constexpr auto upper_half(batch_bool_constant) noexcept { static_assert(sizeof...(Values) % 2 == 0, "upper_half requires even size"); static_assert(batch_bool::size == sizeof...(Values) / 2, "upper_half: target arch size must match submask length"); return splice_impl(std::make_index_sequence()); } } // namespace detail /** * @brief batch of integral constants * * Abstract representation of a batch of integral constants. * * @tparam batch_type the type of the associated batch values. 
* @tparam Values constants represented by this batch **/ template struct batch_constant { static constexpr std::size_t size = sizeof...(Values); using batch_type = batch; using value_type = typename batch_type::value_type; static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); /** * @brief Generate a batch of @p batch_type from this @p batch_constant */ XSIMD_INLINE batch_type as_batch() const noexcept { return { Values... }; } /** * @brief Generate a batch of @p batch_type from this @p batch_constant */ XSIMD_INLINE operator batch_type() const noexcept { return as_batch(); } /** * @brief Get the @p i th element of this @p batch_constant */ constexpr T get(std::size_t i) const noexcept { return get(i, std::array { Values... }); } private: constexpr T get(std::size_t i, std::array const& values) const noexcept { return values[i]; } template struct binary_rshift { constexpr T operator()(T x, T y) const { return x >> y; } }; template struct binary_lshift { constexpr T operator()(T x, T y) const { return x << y; } }; template static constexpr batch_constant::type::value, std::tuple_element::type::value)...> apply(std::index_sequence) { return {}; } template static constexpr auto apply(batch_constant, batch_constant) { static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches"); return apply...>, std::tuple...>>(std::make_index_sequence()); } public: #define MAKE_BINARY_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_constant other) const \ { \ return apply>(*this, other); \ } \ template \ constexpr batch_constant operator OP(std::integral_constant) const \ { \ return {}; \ } MAKE_BINARY_OP(+, std::plus) MAKE_BINARY_OP(-, std::minus) MAKE_BINARY_OP(*, std::multiplies) MAKE_BINARY_OP(/, std::divides) MAKE_BINARY_OP(%, std::modulus) MAKE_BINARY_OP(&, std::bit_and) MAKE_BINARY_OP(|, std::bit_or) MAKE_BINARY_OP(^, std::bit_xor) MAKE_BINARY_OP(<<, binary_lshift) MAKE_BINARY_OP(>>, binary_rshift) #undef 
MAKE_BINARY_OP template static constexpr batch_bool_constant::type::value, std::tuple_element::type::value)...> apply_bool(std::index_sequence) { return {}; } template static constexpr auto apply_bool(batch_constant, batch_constant) { static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches"); return apply_bool...>, std::tuple...>>(std::make_index_sequence()); } #define MAKE_BINARY_BOOL_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_constant other) const \ { \ return apply_bool>(*this, other); \ } \ template \ constexpr batch_bool_constant operator OP(std::integral_constant) const \ { \ return {}; \ } MAKE_BINARY_BOOL_OP(==, std::equal_to) MAKE_BINARY_BOOL_OP(!=, std::not_equal_to) MAKE_BINARY_BOOL_OP(<, std::less) MAKE_BINARY_BOOL_OP(<=, std::less_equal) MAKE_BINARY_BOOL_OP(>, std::greater) MAKE_BINARY_BOOL_OP(>=, std::greater_equal) #undef MAKE_BINARY_BOOL_OP constexpr batch_constant operator-() const { return {}; } constexpr batch_constant operator+() const { return {}; } constexpr batch_constant operator~() const { return {}; } }; namespace detail { template XSIMD_INLINE constexpr batch_constant make_batch_constant(std::index_sequence) noexcept { return {}; } template XSIMD_INLINE constexpr batch_constant(0 * Is) + Val)...> make_batch_constant(std::index_sequence) noexcept { return {}; } #if __cplusplus >= 202002L template XSIMD_INLINE constexpr batch_constant make_batch_constant(std::index_sequence) noexcept { return {}; } #endif template XSIMD_INLINE constexpr batch_bool_constant make_batch_bool_constant(std::index_sequence) noexcept { return {}; } template XSIMD_INLINE constexpr batch_bool_constant(Is) | true) & Val)...> make_batch_bool_constant(std::index_sequence) noexcept { return {}; } #if __cplusplus >= 202002L template XSIMD_INLINE constexpr batch_bool_constant make_batch_bool_constant(std::index_sequence) noexcept { return {}; } #endif } // namespace detail /** * @brief Build a @c batch_constant out of a 
generator function * * @tparam batch_type type of the (non-constant) batch to build * @tparam G type used to generate that batch. That type must have a static * member @c get that's used to generate the batch constant. Conversely, the * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}` * * The following generator produces a batch of `(n - 1, 0, 1, ... n-2)` * * @code * struct Rot * { * static constexpr unsigned get(unsigned i, unsigned n) * { * return (i + n - 1) % n; * } * }; * @endcode */ template XSIMD_INLINE constexpr decltype(detail::make_batch_constant(std::make_index_sequence::size>())) make_batch_constant() noexcept { return {}; } /** * @brief Build a @c batch_bool_constant out of a generator function * * Similar to @c make_batch_constant for @c batch_bool_constant */ template XSIMD_INLINE constexpr decltype(detail::make_batch_bool_constant(std::make_index_sequence::size>())) make_batch_bool_constant() noexcept { return {}; } // FIXME: Skipping those for doxygen because of bad interaction with breathe. #ifndef DOXYGEN_SHOULD_SKIP_THIS /** * @brief Build a @c batch_constant with a single repeated value. * * @tparam T type of the data held in the batch. * @tparam Val The value to repeat. * @tparam A Architecture that will be used when converting to a regular batch. */ template XSIMD_INLINE constexpr decltype(detail::make_batch_constant(std::make_index_sequence::size>())) make_batch_constant() noexcept { return {}; } #if __cplusplus >= 202002L /** * @brief Build a @c batch_constant from a std::array (C++20) * * @tparam Arr The std::array containing the values (non type template argument). * @tparam A Architecture that will be used when converting to a regular batch. 
*/ template requires(Arr.size() == batch::size) XSIMD_INLINE constexpr auto make_batch_constant() noexcept { return detail::make_batch_constant(std::make_index_sequence()); } #endif /* * @brief Build a @c batch_bool_constant with a single repeated value. * * Similar to @c make_batch_constant for @c batch_bool_constant */ template XSIMD_INLINE constexpr decltype(detail::make_batch_bool_constant(std::make_index_sequence::size>())) make_batch_bool_constant() noexcept { return {}; } #if __cplusplus >= 202002L /** * @brief Build a @c batch_constant from a std::array of boolean (C++20) * * @tparam Arr The std::array containing the boolean values (non type template argument). * @tparam A Architecture that will be used when converting to a regular batch. */ template requires( (Arr.size() == batch_bool::size) && std::is_same_v) XSIMD_INLINE constexpr auto make_batch_bool_constant() noexcept { return detail::make_batch_bool_constant(std::make_index_sequence()); } #endif #endif namespace generator { template struct iota { static constexpr T get(size_t index, size_t) { return static_cast(index); } }; } /** * @brief Build a @c batch_constant as an enumerated range * * @tparam T type of the data held in the batch. * @tparam A Architecture that will be used when converting to a regular batch. */ template XSIMD_INLINE constexpr auto make_iota_batch_constant() noexcept { return make_batch_constant, A>(); } } // namespace xsimd #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_batch_fwd.hpp000066400000000000000000000030451517435117100255020ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_BATCH_FWD_HPP #define XSIMD_BATCH_FWD_HPP #include "../config/xsimd_config.hpp" // TODO this is somehow redundant with XSIMD_DEFAULT_ARCH but is only supported // when an architecture is defined. #if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) #define XSIMD_BATCH_DEFAULT_ARCH_IMPL void #else #include "../config/xsimd_arch.hpp" #define XSIMD_BATCH_DEFAULT_ARCH_IMPL default_arch #endif // XSIMD_NO_SUPPORTED_ARCHITECTURE namespace xsimd { template class batch_bool; template struct batch_bool_constant; template class batch; template struct batch_constant; } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_common_arch.hpp000066400000000000000000000035011517435117100260430ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_COMMON_ARCH_HPP #define XSIMD_COMMON_ARCH_HPP #include #include "../config/xsimd_config.hpp" /** * @defgroup architectures Architecture description * */ namespace xsimd { /** * @ingroup architectures * * Base class for all architectures. */ struct common { /// Whether this architecture is supported at compile-time. static constexpr bool supported() noexcept { return true; } /// Whether this architecture is available at run-time. static constexpr bool available() noexcept { return true; } /// If this architectures supports aligned memory accesses, the required /// alignment. static constexpr std::size_t alignment() noexcept { return 0; } /// Whether this architecture requires aligned memory access. 
static constexpr bool requires_alignment() noexcept { return false; } /// Name of the architecture. static constexpr char const* name() noexcept { return "common"; } }; struct unsupported { }; } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_emulated_register.hpp000066400000000000000000000056161517435117100272730ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_EMULATED_REGISTER_HPP #define XSIMD_EMULATED_REGISTER_HPP #include "./xsimd_common_arch.hpp" #include "./xsimd_register.hpp" namespace xsimd { /** * @ingroup architectures * * emulated instructions */ template struct emulated : common { static constexpr bool supported() noexcept { return true; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return false; } static constexpr std::size_t alignment() noexcept { return 8; } static constexpr char const* name() noexcept { return "emulated"; } }; namespace types { template struct simd_emulated_bool_register { using register_type = std::array; register_type data; simd_emulated_bool_register() = default; simd_emulated_bool_register(register_type r) { data = r; } operator register_type() const noexcept { return data; } }; template struct get_bool_simd_register> { using type = simd_emulated_bool_register; }; template struct simd_register> { static_assert(N % (8 * sizeof(T)) == 0, "bit width must be a multiple of scalar width"); using register_type = std::array; register_type data; XSIMD_INLINE operator register_type() const noexcept { return data; } }; 
template struct has_simd_register> : std::is_scalar { }; template struct has_simd_register, emulated> : std::true_type { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct has_simd_register, emulated> : std::true_type { }; #endif } } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_fma3_avx2_register.hpp000066400000000000000000000030271517435117100272530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX2_REGISTER_HPP #define XSIMD_FMA3_AVX2_REGISTER_HPP #include "./xsimd_avx2_register.hpp" namespace xsimd { template struct fma3; /** * @ingroup architectures * * AVX2 + FMA instructions */ template <> struct fma3 : avx2 { static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "fma3+avx2"; } }; #if XSIMD_WITH_FMA3_AVX2 #if !XSIMD_WITH_AVX2 #error "architecture inconsistency: fma3+avx2 requires avx2" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3, avx2); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_fma3_avx_register.hpp000066400000000000000000000030111517435117100271620ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FMA3_AVX_REGISTER_HPP #include "./xsimd_avx_register.hpp" namespace xsimd { template struct fma3; /** * @ingroup architectures * * AVX + FMA instructions */ template <> struct fma3 : avx { static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "fma3+avx"; } }; #if XSIMD_WITH_FMA3_AVX #if !XSIMD_WITH_AVX #error "architecture inconsistency: fma3+avx requires avx" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3, avx); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_fma3_sse_register.hpp000066400000000000000000000030471517435117100271670ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_FMA3_SSE_REGISTER_HPP #define XSIMD_FMA3_SSE_REGISTER_HPP #include "./xsimd_sse4_2_register.hpp" namespace xsimd { template struct fma3; /** * @ingroup architectures * * SSE4.2 + FMA instructions */ template <> struct fma3 : sse4_2 { static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "fma3+sse4.2"; } }; #if XSIMD_WITH_FMA3_SSE #if !XSIMD_WITH_SSE4_2 #error "architecture inconsistency: fma3+sse4.2 requires sse4.2" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3, sse4_2); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_fma4_register.hpp000066400000000000000000000027561517435117100263240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_FMA4_REGISTER_HPP #define XSIMD_FMA4_REGISTER_HPP #include "./xsimd_sse4_2_register.hpp" #if XSIMD_WITH_FMA4 #include #endif namespace xsimd { /** * @ingroup architectures * * SSE4.2 + FMA4 instructions */ struct fma4 : sse4_2 { static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "fma4"; } }; #if XSIMD_WITH_FMA4 #if !XSIMD_WITH_SSE4_2 #error "architecture inconsistency: fma4 requires sse4.2" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_i8mm_neon64_register.hpp000066400000000000000000000033321517435117100275270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_I8MM_NEON64_REGISTER_HPP #define XSIMD_I8MM_NEON64_REGISTER_HPP #include "./xsimd_neon64_register.hpp" namespace xsimd { template struct i8mm; /** * @ingroup architectures * * Neon64 + i8mm instructions */ template <> struct i8mm : neon64 { static constexpr bool supported() noexcept { return XSIMD_WITH_I8MM_NEON64; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "i8mm+neon64"; } }; #if XSIMD_WITH_I8MM_NEON64 #if !XSIMD_WITH_NEON64 #error "architecture inconsistency: i8mm+neon64 requires neon64" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(i8mm, neon64); template struct get_bool_simd_register> : detail::neon_bool_simd_register> { }; } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_neon64_register.hpp000066400000000000000000000034671517435117100266060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON64_REGISTER_HPP #define XSIMD_NEON64_REGISTER_HPP #include "xsimd_neon_register.hpp" namespace xsimd { /** * @ingroup architectures * * NEON instructions for arm64 */ struct neon64 : neon { static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "arm64+neon"; } }; #if XSIMD_WITH_NEON64 #if !XSIMD_WITH_NEON #error "architecture inconsistency: neon64 requires neon" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon); XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t); template struct get_bool_simd_register : detail::neon_bool_simd_register { }; } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_neon_register.hpp000066400000000000000000000116231517435117100264250ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_REGISTER_HPP #define XSIMD_NEON_REGISTER_HPP #include "../config/xsimd_config.hpp" #include "../utils/xsimd_type_traits.hpp" #include "./xsimd_common_arch.hpp" #include "./xsimd_register.hpp" #if XSIMD_WITH_NEON #if defined(_WIN32) && XSIMD_WITH_NEON64 #include #else #include #endif #endif namespace xsimd { /** * @ingroup architectures * * NEON instructions for arm32 */ struct neon : common { static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "arm32+neon"; } }; #if XSIMD_WITH_NEON namespace types { namespace detail { template struct neon_vector_type_impl; template <> struct neon_vector_type_impl<8> { using signed_type = int8x16_t; using unsigned_type = uint8x16_t; }; template <> struct neon_vector_type_impl<16> { using signed_type = int16x8_t; using unsigned_type = uint16x8_t; }; template <> struct neon_vector_type_impl<32> { using signed_type = int32x4_t; using unsigned_type = uint32x4_t; }; template <> struct neon_vector_type_impl<64> { using signed_type = int64x2_t; using unsigned_type = uint64x2_t; }; template using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type; template using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type; template using neon_vector_type = std::conditional_t::value, signed_neon_vector_type, unsigned_neon_vector_type>; using char_neon_vector_type = std::conditional_t::value, signed_neon_vector_type, unsigned_neon_vector_type>; } XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type); 
XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type); XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t); XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon); namespace detail { template struct neon_bool_simd_register { using type = simd_register, A>; }; } template struct get_bool_simd_register : detail::neon_bool_simd_register { }; } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_register.hpp000066400000000000000000000101711517435117100254030ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_REGISTER_HPP #define XSIMD_REGISTER_HPP #include #include "../config/xsimd_macros.hpp" namespace xsimd { namespace types { template struct has_simd_register : std::false_type { }; template struct simd_register { struct register_type { }; }; #define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \ template <> \ struct simd_register \ { \ using register_type = VECTOR_TYPE; \ register_type data; \ XSIMD_INLINE operator register_type() const noexcept \ { \ return data; \ } \ }; \ template <> \ struct has_simd_register : std::true_type \ { \ } #define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA) \ template <> \ struct has_simd_register : std::false_type \ { \ } #define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE) \ template \ struct simd_register : simd_register \ { \ using register_type = typename simd_register::register_type; \ simd_register(register_type reg) noexcept \ : simd_register { reg } \ { \ } \ simd_register() = default; \ }; \ template \ struct has_simd_register : has_simd_register \ { \ } template struct get_bool_simd_register { using type = simd_register; }; template using get_bool_simd_register_t = typename get_bool_simd_register::type; } namespace kernel { template // makes requires_arch equal to A const&, using type_traits functions using requires_arch = std::add_lvalue_reference_t>; template struct convert { }; } } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_rvv_register.hpp000066400000000000000000000560271517435117100263120ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Yibo Cai * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_RVV_REGISTER_HPP #define XSIMD_RVV_REGISTER_HPP #include "../utils/xsimd_type_traits.hpp" #include "./xsimd_common_arch.hpp" #include "./xsimd_register.hpp" #if XSIMD_WITH_RVV #include #endif namespace xsimd { namespace detail { /** * @ingroup architectures * * RVV instructions (fixed vector size) for riscv */ template struct rvv : xsimd::common { static constexpr size_t width = Width; static constexpr bool supported() noexcept { return Width == XSIMD_RVV_BITS; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "riscv+rvv"; } }; } #if XSIMD_WITH_RVV using rvv = detail::rvv<__riscv_v_fixed_vlen>; #define XSIMD_RVV_JOINT_(a, b, c) a##b##c #define XSIMD_RVV_JOINT(a, b, c) XSIMD_RVV_JOINT_(a, b, c) #define XSIMD_RVV_JOINT5(a, b, c, d, e) XSIMD_RVV_JOINT(XSIMD_RVV_JOINT(a, b, c), d, e) #define XSIMD_RVV_TYPE_i(S, V) XSIMD_RVV_JOINT5(vint, S, m, V, _t) #define XSIMD_RVV_TYPE_u(S, V) XSIMD_RVV_JOINT5(vuint, S, m, V, _t) #define XSIMD_RVV_TYPE_f(S, V) XSIMD_RVV_JOINT5(vfloat, S, m, V, _t) #define XSIMD_RVV_TYPE(T, S, V) XSIMD_RVV_JOINT(XSIMD_RVV_TYPE, _, T)(S, V) namespace types { namespace detail { static constexpr size_t rvv_width_mf8 = XSIMD_RVV_BITS / 8; static constexpr size_t rvv_width_mf4 = XSIMD_RVV_BITS / 4; static constexpr size_t rvv_width_mf2 = XSIMD_RVV_BITS / 2; static constexpr size_t rvv_width_m1 = XSIMD_RVV_BITS; // Cope with gcc limitation, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116484 #define XSIMD_RVV_WIDTH_MF8 (XSIMD_RVV_BITS / 8) #define XSIMD_RVV_WIDTH_MF4 (XSIMD_RVV_BITS / 4) #define XSIMD_RVV_WIDTH_MF2 (XSIMD_RVV_BITS / 2) #define XSIMD_RVV_WIDTH_M1 XSIMD_RVV_BITS // 
rvv_type_info is a utility class to convert scalar type and // bitwidth into rvv register types. // // * `type` is the unadorned vector type. // * `fixed_type` is the same type, but with the storage attribute // applied. // * `byte_type` is the type which is the same size in unsigned // bytes, used as an intermediate step for bit-cast operations, // because only a subset of __riscv_vreinterpret() intrinsics // exist -- but always enough to get us to bytes and back. // template struct rvv_type_info; #define XSIMD_RVV_MAKE_TYPE(scalar, t, s, vmul) \ template <> \ struct rvv_type_info \ { \ static constexpr size_t width = rvv_width_m1 * vmul; \ using type = XSIMD_RVV_TYPE(t, s, vmul); \ using byte_type = XSIMD_RVV_TYPE(u, 8, vmul); \ using fixed_type = type __attribute__((riscv_rvv_vector_bits(/*width=*/XSIMD_RVV_WIDTH_M1 * vmul))); \ template \ static XSIMD_INLINE type bitcast(U x) noexcept \ { \ const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, t, s, m, vmul)(words); \ } \ template \ static XSIMD_INLINE byte_type as_bytes(U x) noexcept \ { \ static_assert(std::is_same::value, "inconsistent conversion types"); \ const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, 8, m, vmul)(words); \ } \ }; \ template <> \ XSIMD_INLINE XSIMD_RVV_TYPE(t, s, vmul) rvv_type_info::bitcast(XSIMD_RVV_TYPE(t, s, vmul) x) noexcept { return x; } #define XSIMD_RVV_MAKE_TYPES(vmul) \ XSIMD_RVV_MAKE_TYPE(int8_t, i, 8, vmul) \ XSIMD_RVV_MAKE_TYPE(uint8_t, u, 8, vmul) \ XSIMD_RVV_MAKE_TYPE(int16_t, i, 16, vmul) \ XSIMD_RVV_MAKE_TYPE(uint16_t, u, 16, vmul) \ XSIMD_RVV_MAKE_TYPE(int32_t, i, 32, vmul) \ XSIMD_RVV_MAKE_TYPE(uint32_t, u, 32, vmul) \ XSIMD_RVV_MAKE_TYPE(int64_t, i, 64, vmul) \ XSIMD_RVV_MAKE_TYPE(uint64_t, u, 64, vmul) \ XSIMD_RVV_MAKE_TYPE(float, f, 32, vmul) \ XSIMD_RVV_MAKE_TYPE(double, f, 64, vmul) XSIMD_RVV_MAKE_TYPES(8) 
XSIMD_RVV_MAKE_TYPES(4) XSIMD_RVV_MAKE_TYPES(2) XSIMD_RVV_MAKE_TYPES(1) #undef XSIMD_RVV_TYPE #undef XSIMD_RVV_TYPE_f #undef XSIMD_RVV_TYPE_u #undef XSIMD_RVV_TYPE_i #undef XSIMD_RVV_MAKE_TYPES #undef XSIMD_RVV_MAKE_TYPE // Specialization needed for #1058 template <> XSIMD_INLINE rvv_type_info::type rvv_type_info::bitcast<__rvv_uint8m8_t>( __rvv_uint8m8_t x) noexcept { return __riscv_vreinterpret_i8m8(x); } template <> XSIMD_INLINE rvv_type_info::type rvv_type_info::bitcast<__rvv_uint8m1_t>( __rvv_uint8m1_t x) noexcept { return __riscv_vreinterpret_i8m1(x); } template <> XSIMD_INLINE rvv_type_info::type rvv_type_info::bitcast<__rvv_uint8m1_t>( __rvv_uint8m1_t x) noexcept { return __riscv_vreinterpret_u16m1(x); } template <> XSIMD_INLINE rvv_type_info::type rvv_type_info::bitcast<__rvv_uint8m1_t>( __rvv_uint8m1_t x) noexcept { return __riscv_vreinterpret_u32m1(x); } template <> XSIMD_INLINE rvv_type_info::type rvv_type_info::bitcast<__rvv_uint8m1_t>( __rvv_uint8m1_t x) noexcept { return __riscv_vreinterpret_u64m1(x); } // template <> XSIMD_INLINE rvv_type_info::byte_type rvv_type_info::as_bytes<__rvv_int8m8_t>(__rvv_int8m8_t x) noexcept { return __riscv_vreinterpret_u8m8(x); } template <> XSIMD_INLINE rvv_type_info::byte_type rvv_type_info::as_bytes<__rvv_int8m1_t>(__rvv_int8m1_t x) noexcept { return __riscv_vreinterpret_u8m1(x); } template <> XSIMD_INLINE rvv_type_info::byte_type rvv_type_info::as_bytes<__rvv_uint8m1_t>(__rvv_uint8m1_t x) noexcept { return x; } template <> XSIMD_INLINE rvv_type_info::byte_type rvv_type_info::as_bytes<__rvv_uint16m1_t>(__rvv_uint16m1_t x) noexcept { return __riscv_vreinterpret_u8m1(x); } template <> XSIMD_INLINE rvv_type_info::byte_type rvv_type_info::as_bytes<__rvv_uint32m1_t>(__rvv_uint32m1_t x) noexcept { return __riscv_vreinterpret_u8m1(x); } template <> XSIMD_INLINE rvv_type_info::byte_type rvv_type_info::as_bytes<__rvv_uint64m1_t>(__rvv_uint64m1_t x) noexcept { return __riscv_vreinterpret_u8m1(x); } // rvv_blob is storage-type 
abstraction for a vector register. template struct rvv_blob : public rvv_type_info { using super = rvv_type_info; using typename super::fixed_type; using typename super::type; fixed_type value; type get() const { return value; } void set(type v) { value = v; } }; template struct semitype; template <> struct semitype<2> { using type = vuint8mf2_t __attribute__((riscv_rvv_vector_bits(XSIMD_RVV_WIDTH_MF2))); }; template <> struct semitype<4> { using type = vuint8mf4_t __attribute__((riscv_rvv_vector_bits(XSIMD_RVV_WIDTH_MF4))); }; template <> struct semitype<8> { using type = vuint8mf8_t __attribute__((riscv_rvv_vector_bits(XSIMD_RVV_WIDTH_MF8))); }; // // But sometimes we want our storage type to be less than a whole // register, while presenting as a whole register to the outside // world. This is because some partial-register types are not // defined, but they can (mostly) be emulated using shorter vl on a // full-width register for arithmetic, and cast back to a partial // byte register for storage. 
// template struct rvv_semiblob : public rvv_type_info { using super = rvv_type_info; static constexpr size_t width = rvv_width_m1 / divisor; using typename super::type; using fixed_type = typename semitype::type; using super::as_bytes; using super::bitcast; fixed_type value; vuint8m1_t get_bytes(std::integral_constant) const { return __riscv_vlmul_ext_v_u8mf2_u8m1(value); } vuint8m1_t get_bytes(std::integral_constant) const { return __riscv_vlmul_ext_v_u8mf4_u8m1(value); } vuint8m1_t get_bytes(std::integral_constant) const { return __riscv_vlmul_ext_v_u8mf8_u8m1(value); } type get() const noexcept { vuint8m1_t bytes = get_bytes(std::integral_constant()); return bitcast(bytes); } void set_bytes(vuint8m1_t v, std::integral_constant) { value = __riscv_vlmul_trunc_v_u8m1_u8mf2(v); } void set_bytes(vuint8m1_t v, std::integral_constant) { value = __riscv_vlmul_trunc_v_u8m1_u8mf4(v); } void set_bytes(vuint8m1_t v, std::integral_constant) { value = __riscv_vlmul_trunc_v_u8m1_u8mf8(v); } void set(type v) { vuint8m1_t bytes = as_bytes(v); set_bytes(bytes, std::integral_constant()); } }; template struct rvv_blob : rvv_semiblob { }; template struct rvv_blob : rvv_semiblob { }; template struct rvv_blob : rvv_semiblob { }; // An explicit constructor isn't really explicit enough to allow // implicit bit-casting operations between incompatible types, so // we add this vacuous flag argument when we're serious: // enum rvv_bitcast_flag { XSIMD_RVV_BITCAST }; // the general-purpose vector register type, usable within // templates, and supporting arithmetic on partial registers for // which there is no intrinsic type (by casting via a full register // type). 
// template struct rvv_reg { static constexpr size_t width = Width; static constexpr size_t vl = Width / (sizeof(T) * 8); using blob_type = rvv_blob; using register_type = typename blob_type::type; using byte_type = typename blob_type::byte_type; blob_type value; rvv_reg() noexcept = default; rvv_reg(register_type x) noexcept { value.set(x); } explicit rvv_reg(byte_type v, rvv_bitcast_flag) { value.set(value.bitcast(v)); } template explicit rvv_reg(rvv_reg v, rvv_bitcast_flag) : rvv_reg(v.get_bytes(), XSIMD_RVV_BITCAST) { } byte_type get_bytes() const noexcept { return blob_type::as_bytes(value.get()); } operator register_type() const noexcept { return value.get(); } }; template using rvv_reg_t = std::conditional_t::value, rvv_reg, Width>, void>; // And some more of the same stuff for bool types, which have // similar problems and similar workarounds. // template struct rvv_bool_info; #define XSIMD_RVV_MAKE_BOOL_TYPE(i) \ template <> \ struct rvv_bool_info \ { \ using type = XSIMD_RVV_JOINT(vbool, i, _t); \ template \ static XSIMD_INLINE type bitcast(T value) noexcept \ { \ return XSIMD_RVV_JOINT(__riscv_vreinterpret_b, i, )(value); \ } \ /*template <> static XSIMD_INLINE type bitcast(type value) noexcept { return value; }*/ \ }; XSIMD_RVV_MAKE_BOOL_TYPE(1); XSIMD_RVV_MAKE_BOOL_TYPE(2); XSIMD_RVV_MAKE_BOOL_TYPE(4); XSIMD_RVV_MAKE_BOOL_TYPE(8); XSIMD_RVV_MAKE_BOOL_TYPE(16); XSIMD_RVV_MAKE_BOOL_TYPE(32); XSIMD_RVV_MAKE_BOOL_TYPE(64); #undef XSIMD_RVV_MAKE_BOOL_TYPE #undef XSIMD_RVV_JOINT5 #undef XSIMD_RVV_JOINT #undef XSIMD_RVV_JOINT_ template struct rvv_bool { using bool_info = rvv_bool_info; using storage_type = vuint8m1_t __attribute__((riscv_rvv_vector_bits(XSIMD_RVV_WIDTH_M1))); using type = typename bool_info::type; storage_type value; rvv_bool() = default; rvv_bool(type v) noexcept : value(__riscv_vreinterpret_u8m1(v)) { } template = 0> rvv_bool(rvv_bool v) : value(v.value) { } explicit rvv_bool(uint8_t mask) noexcept : value(__riscv_vmv_v_x_u8m1(mask, 
rvv_width_m1 / 8)) { } explicit rvv_bool(uint64_t mask) noexcept : value(__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vmv_v_x_u64m1(mask, rvv_width_m1 / 64))) { } operator type() const noexcept { return bool_info::bitcast(value); } type as_mask() const noexcept { return (type) * this; } }; template using rvv_bool_t = std::enable_if_t < !std::is_void::value, rvv_bool, Width>; template struct rvv_vector_type_impl; template <> struct rvv_vector_type_impl<8> { using signed_type = rvv_reg_t; using unsigned_type = rvv_reg_t; using floating_point_type = void; }; template <> struct rvv_vector_type_impl<16> { using signed_type = rvv_reg_t; using unsigned_type = rvv_reg_t; using floating_point_type = rvv_reg_t<_Float16>; }; template <> struct rvv_vector_type_impl<32> { using signed_type = rvv_reg_t; using unsigned_type = rvv_reg_t; using floating_point_type = rvv_reg_t; }; template <> struct rvv_vector_type_impl<64> { using signed_type = rvv_reg_t; using unsigned_type = rvv_reg_t; using floating_point_type = rvv_reg_t; }; template using signed_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::signed_type; template using unsigned_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::unsigned_type; template using floating_point_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::floating_point_type; template using signed_int_or_floating_point_rvv_vector_type = std::conditional_t::value, floating_point_rvv_vector_type, signed_int_rvv_vector_type>; template using rvv_vector_type = std::conditional_t::value, signed_int_or_floating_point_rvv_vector_type, unsigned_int_rvv_vector_type>; } // namespace detail XSIMD_DECLARE_SIMD_REGISTER(bool, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(signed char, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(char, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(short, rvv, 
detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(int, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(long int, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(long long int, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(float, rvv, detail::rvv_vector_type); XSIMD_DECLARE_SIMD_REGISTER(double, rvv, detail::rvv_vector_type); namespace detail { template struct rvv_bool_simd_register { using register_type = rvv_bool_t; register_type data; operator register_type() const noexcept { return data; } }; } // namespace detail template struct get_bool_simd_register { using type = detail::rvv_bool_simd_register; }; } // namespace types #undef XSIMD_RVV_WIDTH_MF8 #undef XSIMD_RVV_WIDTH_MF4 #undef XSIMD_RVV_WIDTH_MF2 #undef XSIMD_RVV_WIDTH_M1 #else using rvv = detail::rvv<0xFFFFFFFF>; #endif } // namespace xsimd #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_sse2_register.hpp000066400000000000000000000045331517435117100263440ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE2_REGISTER_HPP #define XSIMD_SSE2_REGISTER_HPP #include "./xsimd_common_arch.hpp" #include "./xsimd_register.hpp" #if XSIMD_WITH_SSE2 #include #include #endif namespace xsimd { /** * @ingroup architectures * * SSE2 instructions */ struct sse2 : common { static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "sse2"; } }; #if XSIMD_WITH_SSE2 namespace types { XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128); XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_sse3_register.hpp000066400000000000000000000027321517435117100263440ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE3_REGISTER_HPP #define XSIMD_SSE3_REGISTER_HPP #include "./xsimd_sse2_register.hpp" #if XSIMD_WITH_SSE3 #include #endif namespace xsimd { /** * @ingroup architectures * * SSE3 instructions */ struct sse3 : sse2 { static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "sse3"; } }; #if XSIMD_WITH_SSE3 #if !XSIMD_WITH_SSE2 #error "architecture inconsistency: sse3 requires sse2" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_sse4_1_register.hpp000066400000000000000000000027621517435117100265700ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE4_1_REGISTER_HPP #define XSIMD_SSE4_1_REGISTER_HPP #include "./xsimd_ssse3_register.hpp" #if XSIMD_WITH_SSE4_1 #include #endif namespace xsimd { /** * @ingroup architectures * * SSE4.1 instructions */ struct sse4_1 : ssse3 { static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "sse4.1"; } }; #if XSIMD_WITH_SSE4_1 #if !XSIMD_WITH_SSSE3 #error "architecture inconsistency: sse4.1 requires ssse3" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_sse4_2_register.hpp000066400000000000000000000027671517435117100265760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSE4_2_REGISTER_HPP #define XSIMD_SSE4_2_REGISTER_HPP #include "./xsimd_sse4_1_register.hpp" #if XSIMD_WITH_SSE4_2 #include #endif namespace xsimd { /** * @ingroup architectures * * SSE4.2 instructions */ struct sse4_2 : sse4_1 { static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "sse4.2"; } }; #if XSIMD_WITH_SSE4_2 #if !XSIMD_WITH_SSE4_1 #error "architecture inconsistency: sse4.2 requires sse4.1" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_ssse3_register.hpp000066400000000000000000000027431517435117100265310ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SSSE3_REGISTER_HPP #define XSIMD_SSSE3_REGISTER_HPP #include "./xsimd_sse3_register.hpp" #if XSIMD_WITH_SSSE3 #include #endif namespace xsimd { /** * @ingroup architectures * * SSSE3 instructions */ struct ssse3 : sse3 { static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; } static constexpr bool available() noexcept { return true; } static constexpr char const* name() noexcept { return "ssse3"; } }; #if XSIMD_WITH_SSSE3 #if !XSIMD_WITH_SSE3 #error "architecture inconsistency: ssse3 requires sse3" #endif namespace types { XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_sve_register.hpp000066400000000000000000000215321517435117100262630ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Yibo Cai * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_SVE_REGISTER_HPP #define XSIMD_SVE_REGISTER_HPP #include "xsimd_common_arch.hpp" #include "xsimd_register.hpp" #if XSIMD_WITH_SVE #include #endif namespace xsimd { namespace detail { /** * @ingroup architectures * * SVE instructions (fixed vector size) for arm64 */ template struct sve : xsimd::common { static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "arm64+sve"; } }; } #if XSIMD_WITH_SVE using sve = detail::sve<__ARM_FEATURE_SVE_BITS>; namespace types { namespace detail { // define fixed size alias per SVE sizeless type #define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))) using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t); using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t); using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t); using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t); using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t); using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t); using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t); using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t); using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t); using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t); using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t); #undef SVE_TO_FIXED_SIZE template struct sve_vector_type_impl; template <> struct sve_vector_type_impl<1> { using signed_type = sve_int8_t; using unsigned_type = sve_uint8_t; using floating_point_type = void; using sizeless_unsigned_type = svuint8_t; using sizeless_signed_type = svint8_t; using sizeless_floating_point_type = void; }; template <> struct sve_vector_type_impl<2> { using signed_type = sve_int16_t; using unsigned_type = 
sve_uint16_t; using floating_point_type = void; using sizeless_unsigned_type = svuint16_t; using sizeless_signed_type = svint16_t; using sizeless_floating_point_type = void; }; template <> struct sve_vector_type_impl<4> { using signed_type = sve_int32_t; using unsigned_type = sve_uint32_t; using floating_point_type = sve_float32_t; using sizeless_unsigned_type = svuint32_t; using sizeless_signed_type = svint32_t; using sizeless_floating_point_type = svfloat32_t; }; template <> struct sve_vector_type_impl<8> { using signed_type = sve_int64_t; using unsigned_type = sve_uint64_t; using floating_point_type = sve_float64_t; using sizeless_unsigned_type = svuint64_t; using sizeless_signed_type = svint64_t; using sizeless_floating_point_type = svfloat64_t; }; template using signed_int_sve_vector_type = typename sve_vector_type_impl::signed_type; template using unsigned_int_sve_vector_type = typename sve_vector_type_impl::unsigned_type; template using floating_point_sve_vector_type = typename sve_vector_type_impl::floating_point_type; template using sizeless_signed_int_sve_vector_type = typename sve_vector_type_impl::sizeless_signed_type; template using sizeless_unsigned_int_sve_vector_type = typename sve_vector_type_impl::sizeless_unsigned_type; template using sizeless_floating_point_sve_vector_type = typename sve_vector_type_impl::sizeless_floating_point_type; template struct sve_vector_impl; template struct sve_vector_impl::value>> { using type = floating_point_sve_vector_type; }; template struct sve_vector_impl::value && std::is_signed::value>> { using type = signed_int_sve_vector_type; }; template struct sve_vector_impl::value && std::is_unsigned::value>> { using type = unsigned_int_sve_vector_type; }; template struct sizeless_sve_vector_impl; template struct sizeless_sve_vector_impl::value>> { using type = sizeless_floating_point_sve_vector_type; }; template struct sizeless_sve_vector_impl::value && std::is_signed::value>> { using type = 
sizeless_signed_int_sve_vector_type; }; template struct sizeless_sve_vector_impl::value && std::is_unsigned::value>> { using type = sizeless_unsigned_int_sve_vector_type; }; template using sve_vector_type = typename detail::sve_vector_impl::type; template using sizeless_sve_vector_type = typename detail::sizeless_sve_vector_impl::type; } // namespace detail XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type); XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type); namespace detail { struct sve_bool_simd_register { using register_type = sve_bool_t; register_type data; operator register_type() const noexcept { return data; } }; } // namespace detail template struct get_bool_simd_register { using type = detail::sve_bool_simd_register; }; } // namespace types #else using sve = detail::sve<0xFFFFFFFF>; #endif } // namespace xsimd #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_traits.hpp000066400000000000000000000310511517435117100250650ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * 
Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_TRAITS_HPP #define XSIMD_TRAITS_HPP #include #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include #endif #include "./xsimd_batch_fwd.hpp" #include "./xsimd_utils.hpp" /** * high level type traits * * @defgroup batch_traits Type traits * **/ namespace xsimd { /************************************** * simd_traits and revert_simd_traits * **************************************/ template struct has_simd_register : types::has_simd_register { }; template struct has_simd_register, A> : has_simd_register { }; namespace detail { template struct simd_traits_impl; template struct simd_traits_impl { using type = T; using bool_type = bool; static constexpr size_t size = 1; }; #if __cplusplus < 201703L template constexpr size_t simd_traits_impl::size; #endif template struct simd_traits_impl { using type = batch; using bool_type = typename type::batch_bool_type; static constexpr size_t size = type::size; }; #if __cplusplus < 201703L template constexpr size_t simd_traits_impl::size; #endif template struct static_check_supported_config_emitter { static_assert(A::supported(), "usage of batch type with unsupported architecture"); static_assert(!A::supported() || xsimd::has_simd_register::value, "usage of batch type with unsupported type"); }; template struct static_check_supported_config_emitter : static_check_supported_config_emitter, A> { }; template struct static_check_supported_config_emitter, A> : static_check_supported_config_emitter { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct static_check_supported_config_emitter, A> : static_check_supported_config_emitter { }; #endif // consistency checker template XSIMD_INLINE void static_check_supported_config() { (void)static_check_supported_config_emitter(); } 
} template struct simd_traits : detail::simd_traits_impl::value> { }; template struct simd_traits> : detail::simd_traits_impl, xsimd::has_simd_register::value> { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct simd_traits> : detail::simd_traits_impl, xsimd::has_simd_register::value> { }; #endif template struct revert_simd_traits { using type = T; static constexpr size_t size = simd_traits::size; }; #if __cplusplus < 201703L template constexpr size_t revert_simd_traits::size; #endif template struct revert_simd_traits> { using type = T; static constexpr size_t size = batch::size; }; #if __cplusplus < 201703L template constexpr size_t revert_simd_traits>::size; #endif template using simd_type = typename simd_traits::type; template using simd_bool_type = typename simd_traits::bool_type; template using revert_simd_type = typename revert_simd_traits::type; /******************** * simd_return_type * ********************/ namespace detail { template struct simd_condition { static constexpr bool value = (std::is_same::value && !std::is_same::value) || (std::is_same::value && !std::is_same::value) || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || detail::is_complex::value; }; template struct simd_return_type_impl : std::enable_if::value, batch> { }; template struct simd_return_type_impl : std::enable_if::value, batch_bool> { }; template struct simd_return_type_impl, A> : std::enable_if::value, batch_bool> { }; template struct simd_return_type_impl, T2, A> : std::enable_if::value, batch, A>> { }; template struct simd_return_type_impl, std::complex, A> : std::enable_if::value, batch, A>> { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct simd_return_type_impl, T2, A> : std::enable_if::value, batch, A>> { }; template struct simd_return_type_impl, std::complex, A> : 
std::enable_if::value, batch, A>> { }; template struct simd_return_type_impl, xtl::xcomplex, A> : std::enable_if::value, batch, A>> { }; template struct simd_return_type_impl, xtl::xcomplex, A> : std::enable_if::value, batch, A>> { }; #endif } template using simd_return_type = typename detail::simd_return_type_impl::type; /** * @ingroup batch_traits * * type traits that provide information about a batch or scalar type. * * @tparam T type to analyze. */ template struct batch_traits { using scalar_type = T; ///< T if scalar, or type of the scalar element for the batch T. using mask_type = bool; ///< Mask type for T: bool for scalars, or batch_bool for batch types. static constexpr bool is_batch = false; ///< True if T is @c batch<...>. static constexpr bool is_batch_bool = false; ///< True if T is @c batch_bool<...>. static constexpr bool is_any_batch = false; ///< True if T is @c batch<...> or @c batch_bool<...>. static constexpr bool is_complex = detail::is_complex::value; ///< True if T is complex or a batch of complex values. 
}; #if __cplusplus < 201703L template constexpr bool batch_traits::is_batch; template constexpr bool batch_traits::is_batch_bool; template constexpr bool batch_traits::is_any_batch; template constexpr bool batch_traits::is_complex; #endif template struct batch_traits> { using scalar_type = T; using mask_type = typename batch::batch_bool_type; static constexpr bool is_batch = true; static constexpr bool is_batch_bool = false; static constexpr bool is_any_batch = true; static constexpr bool is_complex = detail::is_complex::value; }; #if __cplusplus < 201703L template constexpr bool batch_traits>::is_batch; template constexpr bool batch_traits>::is_batch_bool; template constexpr bool batch_traits>::is_any_batch; template constexpr bool batch_traits>::is_complex; #endif template struct batch_traits> { using scalar_type = bool; using mask_type = batch_bool; static constexpr bool is_batch = false; static constexpr bool is_batch_bool = true; static constexpr bool is_any_batch = true; static constexpr bool is_complex = false; }; #if __cplusplus < 201703L template constexpr bool batch_traits>::is_batch; template constexpr bool batch_traits>::is_batch_bool; template constexpr bool batch_traits>::is_any_batch; template constexpr bool batch_traits>::is_complex; #endif /** * @ingroup batch_traits * * type traits that inherits from @c std::true_type for @c batch<...> types and from * @c std::false_type otherwise. * * @tparam T type to analyze. */ template struct is_batch : std::integral_constant::is_batch> { }; /** * @ingroup batch_traits * * type traits that inherits from @c std::true_type for @c batch_bool<...> types and from * @c std::false_type otherwise. * * @tparam T type to analyze. */ template struct is_batch_bool : std::integral_constant::is_batch_bool> { }; /** * @ingroup batch_traits * * type traits that inherits from @c std::true_type for @c batch<...> or batch_bool<...> * types and from @c std::false_type otherwise. * * @tparam T type to analyze. 
*/ template struct is_any_batch : std::integral_constant::is_any_batch> { }; /** * @ingroup batch_traits * * type traits that inherits from @c std::true_type for @c batch> * types and from @c std::false_type otherwise. * * @tparam T type to analyze. */ template struct is_batch_complex : std::integral_constant::is_batch && batch_traits::is_complex> { }; /** * @ingroup batch_traits * * type traits whose @c type field is set to @c T::value_type if @c * is_batch::value and to @c T otherwise. * * @tparam T type to analyze. */ template struct scalar_type { using type = typename batch_traits::scalar_type; }; template using scalar_type_t = typename scalar_type::type; /** * @ingroup batch_traits * * type traits whose @c type field is set to @c T::value_type if @c * is_batch_bool::value and to @c bool otherwise. * * @tparam T type to analyze. */ template struct mask_type { using type = typename batch_traits::mask_type; }; template using mask_type_t = typename mask_type::type; } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_utils.hpp000066400000000000000000000317601517435117100247260ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_UTILS_HPP #define XSIMD_UTILS_HPP #include #include #include #include #include #include #ifdef XSIMD_ENABLE_XTL_COMPLEX #include "xtl/xcomplex.hpp" #endif #include "./xsimd_batch_fwd.hpp" namespace xsimd { /************** * index * **************/ template using index = std::integral_constant; /************** * as_integer * **************/ template struct as_integer : std::make_signed { }; template <> struct as_integer { using type = int32_t; }; template <> struct as_integer { using type = int64_t; }; template struct as_integer> { using type = batch::type, A>; }; template using as_integer_t = typename as_integer::type; /*********************** * as_unsigned_integer * ***********************/ template struct as_unsigned_integer : std::make_unsigned { }; template <> struct as_unsigned_integer { using type = uint8_t; }; template <> struct as_unsigned_integer { using type = uint32_t; }; template <> struct as_unsigned_integer { using type = uint64_t; }; template struct as_unsigned_integer> { using type = batch::type, A>; }; template using as_unsigned_integer_t = typename as_unsigned_integer::type; /********************* * as_signed_integer * *********************/ template struct as_signed_integer : std::make_signed { }; template using as_signed_integer_t = typename as_signed_integer::type; /****************** * flip_sign_type * ******************/ namespace detail { template struct flipped_sign_type_impl : std::make_signed { }; template struct flipped_sign_type_impl : std::make_unsigned { }; } template struct flipped_sign_type : detail::flipped_sign_type_impl::value> { }; template using flipped_sign_type_t = typename flipped_sign_type::type; /*********** * as_float * ************/ template struct as_float; template <> struct as_float { using type = float; }; template <> struct as_float { using type = double; }; template struct as_float> { using type = batch::type, A>; }; 
template using as_float_t = typename as_float::type; /************** * as_logical * **************/ template struct as_logical; template struct as_logical> { using type = batch_bool; }; template using as_logical_t = typename as_logical::type; /******************** * bit_cast * ********************/ template inline To bit_cast(From val) noexcept { static_assert(sizeof(From) == sizeof(To), "casting between compatible layout"); // FIXME: Some old version of GCC don't support that trait // static_assert(std::is_trivially_copyable::value, "input type is trivially copyable"); // static_assert(std::is_trivially_copyable::value, "output type is trivially copyable"); To res; std::memcpy(&res, &val, sizeof(val)); return res; } namespace kernel { namespace detail { /************************************** * enabling / disabling metafunctions * **************************************/ template using enable_arithmetic_t = std::enable_if_t::value, int>; /// Enable signed integral or floating point template using enable_signed_numeral_t = std::enable_if_t::value, int>; template using enable_floating_point_t = std::enable_if_t::value, int>; template using enable_integral_t = std::enable_if_t::value, int>; template using enable_signed_integral_t = std::enable_if_t::value && std::is_signed::value, int>; template using enable_unsigned_integral_t = std::enable_if_t::value && std::is_unsigned::value, int>; template using enable_sized_signed_t = std::enable_if_t::value && std::is_signed::value && sizeof(T) == S, int>; template using enable_sized_unsigned_t = std::enable_if_t::value && !std::is_signed::value && sizeof(T) == S, int>; template using enable_sized_integral_t = std::enable_if_t::value && sizeof(T) == S, int>; template using enable_sized_t = std::enable_if_t; template using enable_max_sized_integral_t = std::enable_if_t::value && sizeof(T) <= S, int>; /******************************** * Matching & mismatching sizes * ********************************/ template using sizes_match_t 
= std::enable_if_t; template using sizes_mismatch_t = std::enable_if_t; template using stride_match_t = std::enable_if_t::value && sizeof(T) == sizeof(U), B>; } // namespace detail } // namespace kernel /***************************************** * Backport of index_sequence from c++14 * *****************************************/ // TODO: Remove this once we drop C++11 support namespace detail { template struct identity { using type = T; }; template using int_sequence = std::integer_sequence; template using make_int_sequence = std::make_integer_sequence; template using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>; // Type-casted index sequence. template inline P indexes_from(std::index_sequence) noexcept { return { static_cast(Is)... }; } template inline P make_sequence_as_batch() noexcept { return indexes_from

(std::make_index_sequence()); } } /********************************* * Backport of void_t from C++17 * *********************************/ namespace detail { template struct make_void { using type = void; }; template using void_t = typename make_void::type; } /************************************************** * Equivalent of void_t but with size_t parameter * **************************************************/ namespace detail { template struct check_size { using type = void; }; template using check_size_t = typename check_size::type; } /***************************************** * Supplementary std::array constructors * *****************************************/ namespace detail { // std::array constructor from scalar value ("broadcast") template inline constexpr std::array array_from_scalar_impl(const T& scalar, std::index_sequence) noexcept { // You can safely ignore this silly ternary, the "scalar" is all // that matters. The rest is just a dirty workaround... return std::array { (Is + 1) ? scalar : T()... }; } template inline constexpr std::array array_from_scalar(const T& scalar) noexcept { return array_from_scalar_impl(scalar, std::make_index_sequence()); } // std::array constructor from C-style pointer (handled as an array) template inline constexpr std::array array_from_pointer_impl(const T* c_array, std::index_sequence) noexcept { return std::array { c_array[Is]... 
}; } template inline constexpr std::array array_from_pointer(const T* c_array) noexcept { return array_from_pointer_impl(c_array, std::make_index_sequence()); } } /************************ * is_array_initializer * ************************/ namespace detail { template struct bool_pack; template using all_true = std::is_same< bool_pack, bool_pack>; template using is_all_convertible = all_true::value...>; template using is_array_initializer = std::enable_if< (sizeof...(Args) == N) && is_all_convertible::value>; // Check that a variadic argument pack is a list of N values of type T, // as usable for instantiating a value of type std::array. template using is_array_initializer_t = typename is_array_initializer::type; } /************** * is_complex * **************/ // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp // so we cannot define is_complex in xsimd_traits.hpp. Besides, if // no file defining batches is included, we still need this definition // in xsimd_traits.hpp, so let's define it here. 
namespace detail { template struct is_complex : std::false_type { }; template struct is_complex> : std::true_type { }; #ifdef XSIMD_ENABLE_XTL_COMPLEX template struct is_complex> : std::true_type { }; #endif } /******************* * real_batch_type * *******************/ template struct real_batch_type { using type = B; }; template struct real_batch_type, A>> { using type = batch; }; template using real_batch_type_t = typename real_batch_type::type; /********************** * complex_batch_type * **********************/ template struct complex_batch_type { using real_value_type = typename B::value_type; using arch_type = typename B::arch_type; using type = batch, arch_type>; }; template struct complex_batch_type, A>> { using type = batch, A>; }; template using complex_batch_type_t = typename complex_batch_type::type; } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_vsx_register.hpp000066400000000000000000000065721517435117100263150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_VSX_REGISTER_HPP #define XSIMD_VSX_REGISTER_HPP #include "./xsimd_common_arch.hpp" #include "./xsimd_register.hpp" #if XSIMD_WITH_VSX #include #endif namespace xsimd { /** * @ingroup architectures * * VSX instructions */ struct vsx : common { static constexpr bool supported() noexcept { return XSIMD_WITH_VSX; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "vmx+vsx"; } }; #if XSIMD_WITH_VSX namespace types { #define XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(T, Tb) \ template <> \ struct get_bool_simd_register \ { \ struct type \ { \ using register_type = __vector __bool Tb; \ register_type data; \ type() = default; \ type(register_type r) \ : data(r) \ { \ } \ operator register_type() const noexcept { return data; } \ }; \ }; \ XSIMD_DECLARE_SIMD_REGISTER(T, vsx, __vector T) XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(signed char, char); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned char, char); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(char, char); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned short, short); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(short, short); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned int, int); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(int, int); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned long, long); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(long, long); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(float, int); XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(double, long); #undef XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_vxe_register.hpp000066400000000000000000000077011517435117100262720ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Andreas Krebbel * * 
Based on xsimd_vsx_register.hpp * * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_VXE_REGISTER_HPP #define XSIMD_VXE_REGISTER_HPP #include "./xsimd_common_arch.hpp" #include "./xsimd_register.hpp" #if XSIMD_WITH_VXE #include #endif namespace xsimd { /** * @ingroup architectures * * VXE instructions */ struct vxe : common { static constexpr bool supported() noexcept { return XSIMD_WITH_VXE; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "vxe"; } }; #if XSIMD_WITH_VXE namespace types { #define XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(T, Tv, Tb) \ template <> \ struct get_bool_simd_register \ { \ struct type \ { \ using register_type = __vector __bool Tb; \ register_type data; \ type() = default; \ type(register_type r) \ : data(r) \ { \ } \ operator register_type() const noexcept { return data; } \ }; \ }; \ XSIMD_DECLARE_SIMD_REGISTER(T, vxe, __vector Tv) // The VXE vector intrinsics do not support long, unsigned long, // and char data types. batches of these types are vectors of // equivalent types. 
XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(signed char, signed char, char); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned char, unsigned char, char); #ifdef __CHAR_UNSIGNED__ XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, unsigned char, char); #else XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, signed char, char); #endif XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned short, unsigned short, short); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(short, short, short); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned int, unsigned int, int); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(int, int, int); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned long, unsigned long long, long long); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(long, long long, long long); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(float, float, int); XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(double, double, long long); #undef XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/types/xsimd_wasm_register.hpp000066400000000000000000000046051517435117100264370ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Anutosh Bhat * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_WASM_REGISTER_HPP #define XSIMD_WASM_REGISTER_HPP #include "xsimd_common_arch.hpp" #include "xsimd_register.hpp" #if XSIMD_WITH_WASM #include #endif namespace xsimd { /** * @ingroup architectures * * WASM instructions */ struct wasm : common { static constexpr bool supported() noexcept { return XSIMD_WITH_WASM; } static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } static constexpr char const* name() noexcept { return "wasm"; } }; #if XSIMD_WITH_WASM namespace types { XSIMD_DECLARE_SIMD_REGISTER(signed char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned short, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(short, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned int, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(int, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(long int, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(long long int, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(float, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(double, wasm, v128_t); } #endif } #endif xtensor-stack-xsimd-541558d/include/xsimd/utils/000077500000000000000000000000001517435117100216365ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/include/xsimd/utils/bits.hpp000066400000000000000000000112621517435117100233120ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. 
* * * * The full license is in the file LICENSE, distributed with this software. * ***************************************************************************/ #ifndef XSIMD_CPUID_UTILS_HPP #define XSIMD_CPUID_UTILS_HPP #include #include namespace xsimd { namespace utils { template constexpr I make_bit_mask(I bit) { static_assert(std::is_unsigned::value, "Bit operations must be done on unsigned integers"); assert(bit < static_cast(8 * sizeof(I))); return static_cast(I { 1 } << bit); } template constexpr I make_bit_mask(I bit, Args... bits) { // TODO(C++17): Use fold expression static_assert(std::is_unsigned::value, "Bit operations must be done on unsigned integers"); return make_bit_mask(bit) | make_bit_mask(static_cast(bits)...); } template constexpr bool all_bits_set(I value) { static_assert(std::is_unsigned::value, "Bit operations must be done on unsigned integers"); constexpr I mask = make_bit_mask(static_cast(Bits)...); return (value & mask) == mask; } template constexpr I set_bit(I value) { static_assert(std::is_unsigned::value, "Bit operations must be done on unsigned integers"); constexpr I mask = make_bit_mask(static_cast(Bit)); return value | mask; } /** * Return a mask with the `width` lowest bits set. */ template constexpr I make_low_mask(I width) noexcept { static_assert(std::is_unsigned::value, "Bit operations must be done on unsigned integers"); assert(width <= static_cast(8 * sizeof(I))); if (width == static_cast(8 * sizeof(I))) { return ~I { 0 }; } return (I { 1 } << width) - I { 1 }; } /* A bitset over an unsigned integer type, indexed by an enum key type. */ template struct uint_bitset { /* The underlying unsigned integer type storing the bits. */ using storage_type = U; /* The enum type whose values name individual bits. */ using key_type = K; /* Construct from a raw bit pattern. */ constexpr explicit uint_bitset(storage_type bitset = {}) noexcept : m_bitset(bitset) { } /* Return true if every bit named by the template arguments is set. 
*/ template constexpr bool all_bits_set() const noexcept { return utils::all_bits_set(bits)...>(m_bitset); } /* Return true if the bit is set. */ template constexpr bool bit_is_set() const noexcept { return all_bits_set(); } /* Set the corresponding bit to true in the bitfield. */ template constexpr void set_bit() noexcept { m_bitset = utils::set_bit(bit)>(m_bitset); } /* Extract the bits in [start, end[, shifted down to start at bit 0. */ template constexpr storage_type get_range() const noexcept { constexpr storage_type start_bit = static_cast(start); constexpr storage_type end_bit = static_cast(end); constexpr storage_type width = end_bit - start_bit; constexpr storage_type mask = make_low_mask(width); return (m_bitset >> start_bit) & mask; } private: storage_type m_bitset = { 0 }; }; } } #endif xtensor-stack-xsimd-541558d/include/xsimd/utils/xsimd_type_traits.hpp000066400000000000000000000111741517435117100261260ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_TYPE_TRAITS_HPP #define XSIMD_TYPE_TRAITS_HPP #include #include #include namespace xsimd { namespace detail { template struct sized_num_types; template <> struct sized_num_types<1> { using signed_type = std::int8_t; using unsigned_type = std::uint8_t; using floating_point_type = void; }; template <> struct sized_num_types<2> { using signed_type = std::int16_t; using unsigned_type = std::uint16_t; using floating_point_type = void; }; template <> struct sized_num_types<4> { using signed_type = std::int32_t; using unsigned_type = std::uint32_t; using floating_point_type = float; }; template <> struct sized_num_types<8> { using signed_type = std::int64_t; using unsigned_type = std::uint64_t; using floating_point_type = double; }; } /** * @ingroup type_traits * * Signed integer type with exactly @c S bytes (1, 2, 4, or 8). * * @tparam S size in bytes. */ template using sized_int_t = typename detail::sized_num_types::signed_type; /** * @ingroup type_traits * * Unsigned integer type with exactly @c S bytes (1, 2, 4, or 8). * * @tparam S size in bytes. */ template using sized_uint_t = typename detail::sized_num_types::unsigned_type; /** * @ingroup type_traits * * Floating-point type with exactly @c S bytes (4 for @c float, 8 for @c double). * Yields @c void for sizes without a standard floating-point type (1, 2). * * @tparam S size in bytes. 
*/ template using sized_fp_t = typename detail::sized_num_types::floating_point_type; namespace detail { template struct remap_num { using type = T; }; template struct remap_num::value>> { using type = xsimd::sized_fp_t; }; template struct remap_num::value && std::is_signed::value>> { using type = xsimd::sized_int_t; }; template struct remap_num::value && std::is_unsigned::value>> { using type = xsimd::sized_uint_t; }; } /** * @ingroup type_traits * * Remap numeral types to their fixed sized variant (``[u]int{8,16,32}_t`` * and pass through other types). * Certain platforms have different types (*i.e.* not aliases) between * ``char`` and ``int8_t``, or ``long long`` and ``int{32,64}_t``, with SIMD * intrinsicts only defined for some of them. * Handling them requires to cast to a known predictable type. * * @tparam T arithmetic type to project from. */ template using map_to_sized_type_t = typename detail::remap_num::type; /** * @ingroup type_traits * * The next-wider arithmetic type for @c T: doubles the size while preserving * signedness for integers and yielding @c double for @c float. * Supported input types: @c [u]int{8,16,32}_t and @c float. * * @tparam T arithmetic type to widen. */ template using widen_t = typename detail::remap_num::type; } #endif xtensor-stack-xsimd-541558d/include/xsimd/xsimd.hpp000066400000000000000000000032011517435117100223270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_HPP #define XSIMD_HPP #include "config/xsimd_config.hpp" #include "config/xsimd_macros.hpp" #include "arch/xsimd_scalar.hpp" #include "memory/xsimd_aligned_allocator.hpp" #include "types/xsimd_batch_fwd.hpp" #if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE) namespace xsimd { // no type definition or anything apart from scalar definition and aligned allocator template class batch { static constexpr bool supported_architecture = sizeof(A*) == 0; // type-dependant but always false static_assert(supported_architecture, "No SIMD architecture detected, cannot instantiate a batch"); }; } #else #include "types/xsimd_batch.hpp" #include "types/xsimd_batch_constant.hpp" #include "types/xsimd_traits.hpp" // This include must come last #include "types/xsimd_api.hpp" #endif // XSIMD_NO_SUPPORTED_ARCHITECTURE #endif xtensor-stack-xsimd-541558d/install_sde.sh000066400000000000000000000012361517435117100205660ustar00rootroot00000000000000#git clone https://github.com/marehr/intel-sde-downloader #cd intel-sde-downloader #pip install -r requirements.txt #python ./intel-sde-downloader.py sde-external-8.35.0-2019-03-11-lin.tar.bz2 #wget http://software.intel.com/content/dam/develop/external/us/en/protected/sde-external-8.50.0-2020-03-26-lin.tar.bz2 wget --user-agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36" https://github.com/xtensor-stack/xsimd-testing-resources/releases/download/2.0.0/sde-external-9.48.0-2024-11-25-lin.tar.xz tar xvf sde-external-9.48.0-2024-11-25-lin.tar.xz sudo sh -c "echo 0 > /proc/sys/kernel/yama/ptrace_scope" xtensor-stack-xsimd-541558d/readthedocs.yml000066400000000000000000000003171517435117100207400ustar00rootroot00000000000000version: 2 build: os: "ubuntu-22.04" tools: python: "mambaforge-22.9" sphinx: # Path to Sphinx configuration file configuration: docs/source/conf.py conda: environment: 
docs/environment.yml xtensor-stack-xsimd-541558d/test/000077500000000000000000000000001517435117100167065ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/test/CMakeLists.txt000066400000000000000000000226151517435117100214540ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.13) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-test) enable_testing() find_package(xsimd REQUIRED CONFIG) endif () if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting tests build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." 
FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() include(CheckCXXCompilerFlag) string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE) OPTION(XSIMD_ENABLE_WERROR "Turn on -Werror" OFF) ################ # ARM SETTINGS # ################ OPTION(CROSS_COMPILE_ARM "cross compile for ARM targets" OFF) # Note: to compile on ARM (or cross compile), you may need to add the following: # -DTARGET_ARCH="armv8-a -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi" set(TARGET_ARCH "native" CACHE STRING "Target architecture arguments") string(REGEX MATCH "emulated\\<[0-9]+\\>" TARGET_EMULATED ${TARGET_ARCH}) if (TARGET_EMULATED) message(STATUS "Using emulated target: ${TARGET_EMULATED}") set(EMULATED_COMPILE_FLAGS -DXSIMD_DEFAULT_ARCH=${TARGET_ARCH};-DXSIMD_WITH_EMULATED=1) unset(TARGET_ARCH CACHE) endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if (NOT WIN32 AND NOT ANDROID) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wunused-parameter -Wextra -Wreorder") if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshorten-64-to-32") endif() # Users may override the c++ standard: if(NOT DEFINED CMAKE_CXX_STANDARD OR "${CMAKE_CXX_STANDARD}" STREQUAL "") if (ENABLE_XTL_COMPLEX) CHECK_CXX_COMPILER_FLAG("-std=c++17" HAS_CPP17_FLAG) if (NOT HAS_CPP17_FLAG) message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++17 support when xtl complex support is enabled") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") else() CHECK_CXX_COMPILER_FLAG("-std=c++14" HAS_CPP14_FLAG) if (NOT HAS_CPP14_FLAG) message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++14 support!") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") endif() endif() endif() if (NOT CROSS_COMPILE_ARM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fPIC") endif() endif() if (ANDROID) # Nothing to do here, we assume the cmake Android NDK toolchain sets the # 
correct options for arm and neon. elseif (CROSS_COMPILE_ARM) # We're cross-compiling with clang++ on Azure Pipelines, this is all pretty specific and just for testing set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) set(CMAKE_SHARED_LIBRARY_LINK_CXX_FLAGS) set(CMAKE_THREAD_LIBS_INIT) set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER_TARGET arm-linux-gnueabi) set(CMAKE_CXX_COMPILER_TARGET arm-linux-gnueabi) set(ARM_ARCH_DIRECTORY "arm-linux-gnueabi" CACHE STRING "ARM arch header dir") set(ARM_GCC_VER "4.7.3" CACHE STRING "ARM GCC header dir") include_directories(/usr/${ARM_ARCH_DIRECTORY}/include/c++/${ARM_GCC_VER}/${ARM_ARCH_DIRECTORY}/) include_directories(/usr/${ARM_ARCH_DIRECTORY}/include/c++/${ARM_GCC_VER}/) include_directories(/usr/${ARM_ARCH_DIRECTORY}/include/) if(NOT CMAKE_CXX_FLAGS MATCHES "-march") message(STATUS "SETTING ARCH TO ${TARGET_ARCH}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") endif() if(ARM_ARCH_DIRECTORY MATCHES "arm-linux-gnueabi") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -mfloat-abi=softfp -target arm-linux-gnueabi") else () # delegating to gcc here endif() message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") message(STATUS "CMAKE_CXX_LINK_EXECUTABLE: ${CMAKE_CXX_LINK_EXECUTABLE}") elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^ppc64" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=${TARGET_ARCH} -mtune=${TARGET_ARCH}") elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") # Nothing specific elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") # Nothing specific elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") elseif(NOT WIN32 AND NOT EMSCRIPTEN) if(TARGET_ARCH AND NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") endif() endif() endif() if(CMAKE_CXX_COMPILER_ID MATCHES MSVC) 
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP /bigobj") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267 /wd4005 /wd4146 /wd4800") set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO) endif() if(CMAKE_CXX_COMPILER_ID MATCHES Clang AND MSVC AND WIN32) # We are using clang-cl add_compile_options(/EHsc /bigobj) set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO) endif() set(XSIMD_TESTS main.cpp test_api.cpp test_arch.cpp test_basic_math.cpp test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp test_batch_complex.cpp test_batch_float.cpp test_batch_int.cpp test_bit.cpp test_bitwise_cast.cpp test_batch_constant.cpp test_batch_manip.cpp test_complex_exponential.cpp test_complex_hyperbolic.cpp test_complex_power.cpp test_complex_trigonometric.cpp test_conversion.cpp test_cpu_features.cpp test_custom_default_arch.cpp test_error_gamma.cpp test_explicit_batch_instantiation.cpp test_exponential.cpp test_extract_pair.cpp test_fp_manipulation.cpp test_hyperbolic.cpp test_load_store.cpp test_memory.cpp test_poly_evaluation.cpp test_power.cpp test_rounding.cpp test_select.cpp test_shuffle.cpp test_sum.cpp test_traits.cpp test_trigonometric.cpp test_utils_bits.cpp test_xsimd_api.cpp test_utils.hpp ) if(NOT MSVC) list(APPEND XSIMD_TESTS test_gnu_source.cpp) endif() add_executable(test_xsimd ${XSIMD_TESTS}) target_link_libraries(test_xsimd PRIVATE xsimd) option(DOWNLOAD_DOCTEST OFF) find_package(doctest QUIET) if (doctest_FOUND) set(DOCTEST_MINIMAL_VERSION 2.4.9) if (doctest_VERSION VERSION_LESS DOCTEST_MINIMAL_VERSION) message(FATAL_ERROR "Requires doctest >= ${DOCTEST_MINIMAL_VERSION}") endif() target_link_libraries(test_xsimd PRIVATE doctest::doctest) elseif(DOWNLOAD_DOCTEST) file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/doctest") file(DOWNLOAD "https://github.com/doctest/doctest/releases/download/v2.4.9/doctest.h" "${CMAKE_CURRENT_BINARY_DIR}/doctest/doctest.h" STATUS DOWNLOAD_DOCTEST_STATUS) list(GET DOWNLOAD_DOCTEST_STATUS 0 DOWNLOAD_DOCTEST_STATUS_CODE) list(GET 
DOWNLOAD_DOCTEST_STATUS 1 DOWNLOAD_DOCTEST_ERROR_MESSAGE) if(${DOWNLOAD_DOCTEST_STATUS_CODE} EQUAL 0) message(STATUS "Successfully downloaded doctest.h") else() message(FATAL_ERROR "Error occurred during download of doctest: ${DOWNLOAD_DOCTEST_ERROR_MESSAGE}") endif() target_include_directories(test_xsimd PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) else() message(FATAL_ERROR " Cannot find required doctest component. Please either set CMAKE_PREFIX_PATH to the location of doctestConfig.cmake, or set DOWNLOAD_DOCTEST=ON") endif() if(ENABLE_XTL_COMPLEX) add_compile_definitions(XSIMD_ENABLE_XTL_COMPLEX=1) target_include_directories(test_xsimd PRIVATE ${xtl_INCLUDE_DIRS}) endif() add_test(NAME test_xsimd COMMAND test_xsimd) if (CROSS_COMPILE_ARM) add_custom_target(xtest COMMAND qemu-arm -L /usr/arm-linux-gnueabi/ test_xsimd DEPENDS test_xsimd) else() add_custom_target(xtest COMMAND test_xsimd DEPENDS test_xsimd) endif() if (XSIMD_ENABLE_WERROR) target_compile_options(test_xsimd PRIVATE -Werror -Wall -DXSIMD_SKIP_ON_WERROR) endif() add_subdirectory(doc) add_subdirectory(architectures) if(EMULATED_COMPILE_FLAGS) message(STATUS ${EMULATED_COMPILE_FLAGS}) target_compile_options(test_xsimd PRIVATE ${EMULATED_COMPILE_FLAGS}) endif() if(EMSCRIPTEN) set_target_properties(test_xsimd PROPERTIES LINK_FLAGS "-s MODULARIZE=1 -s EXPORT_NAME=test_xsimd_wasm -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -lembind") target_compile_options(test_xsimd PUBLIC --std=c++17 PUBLIC "SHELL: -msimd128" PUBLIC "SHELL: -msse2" ) endif() xtensor-stack-xsimd-541558d/test/architectures/000077500000000000000000000000001517435117100215535ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/test/architectures/CMakeLists.txt000066400000000000000000000022541517435117100243160ustar00rootroot00000000000000set(INTEL_PROCESSORS bdver1 knl knm skylake-avx512 cannonlake icelake-client icelake-server cascadelake cooperlake tigerlake sapphirerapids alderlake rocketlake graniterapids graniterapids-d znver4) if(NOT TARGET xsimd) 
find_package(xsimd REQUIRED CONFIG) endif() foreach(INTEL_PROCESSOR ${INTEL_PROCESSORS}) # Adding the werror here to choke if the -march is incompatible with the # native one. check_cxx_compiler_flag(-march=${INTEL_PROCESSOR} FLAG_SUPPORTED_${INTEL_PROCESSOR}) if(FLAG_SUPPORTED_${INTEL_PROCESSOR}) message(STATUS ${INTEL_PROCESSOR}) add_library(test_${INTEL_PROCESSOR} OBJECT dummy.cpp) target_compile_options(test_${INTEL_PROCESSOR} PRIVATE -march=${INTEL_PROCESSOR}) target_link_libraries(test_${INTEL_PROCESSOR} PRIVATE xsimd) add_dependencies(xtest test_${INTEL_PROCESSOR}) if(ENABLE_XTL_COMPLEX) target_compile_features(test_${INTEL_PROCESSOR} PRIVATE cxx_std_14) target_compile_definitions(test_${INTEL_PROCESSOR} PRIVATE XSIMD_ENABLE_XTL_COMPLEX=1) target_link_libraries(test_${INTEL_PROCESSOR} PRIVATE xtl) endif() endif() endforeach() xtensor-stack-xsimd-541558d/test/architectures/dummy.cpp000066400000000000000000000003111517435117100234050ustar00rootroot00000000000000#include // Basic check: can we instantiate a batch for the given compiler flags? xsimd::batch come_and_get_some(xsimd::batch x, xsimd::batch y) { return x + y; } xtensor-stack-xsimd-541558d/test/avx.sh000066400000000000000000000065111517435117100200430ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm256_[a-z1Z0-9_]+' $0` do if ! 
grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd _mm256_add_pd _mm256_add_ps #_mm256_addsub_pd #_mm256_addsub_ps _mm256_and_pd _mm256_and_ps _mm256_andnot_pd _mm256_andnot_ps _mm256_blend_pd _mm256_blend_ps _mm256_blendv_pd _mm256_blendv_ps #_mm256_broadcast_pd #_mm256_broadcast_ps #_mm256_broadcast_sd #_mm256_broadcast_ss _mm256_castpd_ps _mm256_castpd_si256 #_mm256_castpd128_pd256 #_mm256_castpd256_pd128 _mm256_castps_pd _mm256_castps_si256 #_mm256_castps128_ps256 #_mm256_castps256_ps128 _mm256_castsi128_si256 _mm256_castsi256_pd _mm256_castsi256_ps _mm256_castsi256_si128 _mm256_ceil_pd _mm256_ceil_ps _mm256_cmp_pd _mm256_cmp_ps #_mm256_cvtepi32_pd _mm256_cvtepi32_ps #_mm256_cvtpd_epi32 #_mm256_cvtpd_ps #_mm256_cvtps_epi32 #_mm256_cvtps_pd #_mm256_cvtsd_f64 #_mm256_cvtsi256_si32 #_mm256_cvtss_f32 #_mm256_cvttpd_epi32 _mm256_cvttps_epi32 _mm256_div_pd _mm256_div_ps #_mm256_dp_ps #_mm256_extract_epi32 #_mm256_extract_epi64 _mm256_extractf128_pd _mm256_extractf128_ps _mm256_extractf128_si256 _mm256_floor_pd _mm256_floor_ps _mm256_hadd_pd _mm256_hadd_ps #_mm256_hsub_pd #_mm256_hsub_ps #_mm256_insert_epi16 #_mm256_insert_epi32 #_mm256_insert_epi64 #_mm256_insert_epi8 _mm256_insertf128_pd _mm256_insertf128_ps _mm256_insertf128_si256 #_mm256_lddqu_si256 _mm256_load_pd _mm256_load_ps _mm256_load_si256 _mm256_loadu_pd _mm256_loadu_ps _mm256_loadu_si256 #_mm256_loadu2_m128 #_mm256_loadu2_m128d #_mm256_loadu2_m128i #_mm256_maskload_pd #_mm256_maskload_ps #_mm256_maskstore_pd #_mm256_maskstore_ps _mm256_max_pd _mm256_max_ps _mm256_min_pd _mm256_min_ps #_mm256_movedup_pd #_mm256_movehdup_ps #_mm256_moveldup_ps #_mm256_movemask_pd #_mm256_movemask_ps _mm256_mul_pd _mm256_mul_ps _mm256_or_pd _mm256_or_ps #_mm256_permute_pd #_mm256_permute_ps _mm256_permute2f128_pd _mm256_permute2f128_ps #_mm256_permute2f128_si256 #_mm256_permutevar_pd #_mm256_permutevar_ps 
#_mm256_rcp_ps _mm256_round_pd _mm256_round_ps #_mm256_rsqrt_ps #_mm256_set_epi16 #_mm256_set_epi32 _mm256_set_epi64x #_mm256_set_epi8 #_mm256_set_m128 #_mm256_set_m128d #_mm256_set_m128i #_mm256_set_pd #_mm256_set_ps _mm256_set1_epi16 _mm256_set1_epi32 _mm256_set1_epi64x _mm256_set1_epi8 _mm256_set1_pd _mm256_set1_ps _mm256_setr_epi16 _mm256_setr_epi32 #_mm256_setr_epi64x _mm256_setr_epi8 #_mm256_setr_m128 #_mm256_setr_m128d #_mm256_setr_m128i _mm256_setr_pd _mm256_setr_ps #_mm256_setzero_pd #_mm256_setzero_ps _mm256_setzero_si256 #_mm256_shuffle_pd _mm256_shuffle_ps _mm256_sqrt_pd _mm256_sqrt_ps _mm256_store_pd _mm256_store_ps _mm256_store_si256 _mm256_storeu_pd _mm256_storeu_ps _mm256_storeu_si256 #_mm256_storeu2_m128 #_mm256_storeu2_m128d #_mm256_storeu2_m128i #_mm256_stream_pd #_mm256_stream_ps #_mm256_stream_si256 _mm256_sub_pd _mm256_sub_ps _mm256_testc_pd _mm256_testc_ps _mm256_testc_si256 #_mm256_testnzc_pd #_mm256_testnzc_ps #_mm256_testnzc_si256 _mm256_testz_pd _mm256_testz_ps _mm256_testz_si256 #_mm256_undefined_pd #_mm256_undefined_ps #_mm256_undefined_si256 _mm256_unpackhi_pd _mm256_unpackhi_ps _mm256_unpacklo_pd _mm256_unpacklo_ps _mm256_xor_pd _mm256_xor_ps #_mm256_zeroall #_mm256_zeroupper #_mm256_zextpd128_pd256 #_mm256_zextps128_ps256 #_mm256_zextsi128_si256 xtensor-stack-xsimd-541558d/test/avx2.sh000066400000000000000000000067131517435117100201310ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm256_[a-z1Z0-9_]+' $0` do if ! 
grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd _mm256_abs_epi16 _mm256_abs_epi32 _mm256_abs_epi8 _mm256_add_epi16 _mm256_add_epi32 _mm256_add_epi64 _mm256_add_epi8 _mm256_adds_epi16 _mm256_adds_epi8 _mm256_adds_epu16 _mm256_adds_epu8 #_mm256_alignr_epi8 _mm256_and_si256 _mm256_andnot_si256 #_mm256_avg_epu16 #_mm256_avg_epu8 _mm256_blend_epi16 _mm256_blend_epi32 _mm256_blendv_epi8 #_mm256_broadcastb_epi8 #_mm256_broadcastd_epi32 #_mm256_broadcastq_epi64 #_mm256_broadcastsd_pd #_mm256_broadcastsi128_si256 #_mm256_broadcastss_ps #_mm256_broadcastw_epi16 #_mm256_bslli_epi128 #_mm256_bsrli_epi128 _mm256_cmpeq_epi16 _mm256_cmpeq_epi32 _mm256_cmpeq_epi64 _mm256_cmpeq_epi8 _mm256_cmpgt_epi16 _mm256_cmpgt_epi32 _mm256_cmpgt_epi64 _mm256_cmpgt_epi8 #_mm256_cvtepi16_epi32 #_mm256_cvtepi16_epi64 #_mm256_cvtepi32_epi64 #_mm256_cvtepi8_epi16 #_mm256_cvtepi8_epi32 #_mm256_cvtepi8_epi64 #_mm256_cvtepu16_epi32 #_mm256_cvtepu16_epi64 #_mm256_cvtepu32_epi64 #_mm256_cvtepu8_epi16 #_mm256_cvtepu8_epi32 #_mm256_cvtepu8_epi64 #_mm256_extract_epi16 #_mm256_extract_epi8 _mm256_extracti128_si256 _mm256_hadd_epi16 _mm256_hadd_epi32 #_mm256_hadds_epi16 #_mm256_hsub_epi16 #_mm256_hsub_epi32 #_mm256_hsubs_epi16 #_mm256_i32gather_epi32 #_mm256_mask_i32gather_epi32 #_mm256_i32gather_epi64 #_mm256_mask_i32gather_epi64 #_mm256_i32gather_pd #_mm256_mask_i32gather_pd #_mm256_i32gather_ps #_mm256_mask_i32gather_ps #_mm256_i64gather_epi32 #_mm256_mask_i64gather_epi32 #_mm256_i64gather_epi64 #_mm256_mask_i64gather_epi64 #_mm256_i64gather_pd #_mm256_mask_i64gather_pd #_mm256_i64gather_ps #_mm256_mask_i64gather_ps #_mm256_inserti128_si256 #_mm256_madd_epi16 #_mm256_maddubs_epi16 #_mm256_maskload_epi32 #_mm256_maskload_epi64 #_mm256_maskstore_epi32 #_mm256_maskstore_epi64 _mm256_max_epi16 _mm256_max_epi32 _mm256_max_epi8 _mm256_max_epu16 _mm256_max_epu32 _mm256_max_epu8 _mm256_min_epi16 
_mm256_min_epi32 _mm256_min_epi8 _mm256_min_epu16 _mm256_min_epu32 _mm256_min_epu8 #_mm256_movemask_epi8 #_mm256_mpsadbw_epu8 #_mm256_mul_epi32 #_mm256_mul_epu32 #_mm256_mulhi_epi16 #_mm256_mulhi_epu16 #_mm256_mulhrs_epi16 _mm256_mullo_epi16 _mm256_mullo_epi32 _mm256_or_si256 #_mm256_packs_epi16 #_mm256_packs_epi32 #_mm256_packus_epi16 #_mm256_packus_epi32 #_mm256_permute2x128_si256 #_mm256_permute4x64_epi64 #_mm256_permute4x64_pd #_mm256_permutevar8x32_epi32 #_mm256_permutevar8x32_ps #_mm256_sad_epu8 #_mm256_shuffle_epi32 #_mm256_shuffle_epi8 #_mm256_shufflehi_epi16 #_mm256_shufflelo_epi16 #_mm256_sign_epi16 #_mm256_sign_epi32 #_mm256_sign_epi8 #_mm256_sll_epi16 #_mm256_sll_epi32 #_mm256_sll_epi64 _mm256_slli_epi16 _mm256_slli_epi32 _mm256_slli_epi64 #_mm256_slli_si256 _mm256_sllv_epi32 _mm256_sllv_epi64 #_mm256_sra_epi16 #_mm256_sra_epi32 _mm256_srai_epi16 _mm256_srai_epi32 _mm256_srav_epi32 #_mm256_srl_epi16 #_mm256_srl_epi32 #_mm256_srl_epi64 _mm256_srli_epi16 _mm256_srli_epi32 _mm256_srli_epi64 #_mm256_srli_si256 _mm256_srlv_epi32 _mm256_srlv_epi64 #_mm256_stream_load_si256 _mm256_sub_epi16 _mm256_sub_epi32 _mm256_sub_epi64 _mm256_sub_epi8 _mm256_subs_epi16 _mm256_subs_epi8 _mm256_subs_epu16 _mm256_subs_epu8 _mm256_unpackhi_epi16 _mm256_unpackhi_epi32 _mm256_unpackhi_epi64 _mm256_unpackhi_epi8 _mm256_unpacklo_epi16 _mm256_unpacklo_epi32 _mm256_unpacklo_epi64 _mm256_unpacklo_epi8 _mm256_xor_si256 xtensor-stack-xsimd-541558d/test/check_arch.sh000066400000000000000000000022241517435117100213140ustar00rootroot00000000000000#!/bin/sh set -e CXX=g++ printf "int main() { return 0;}" > sanity_check.cpp printf "#include \nint main() { return 0;}" > xsimd_check.cpp sed -n '/x86[-]64/,$ p' $0 | \ while read arch; do \ if echo $arch | grep -q '#' ; then continue; fi ; \ echo "# $arch" ; \ $CXX -w -march=$arch sanity_check.cpp -fsyntax-only ; \ $CXX -w -I../include -march=$arch xsimd_check.cpp -fsyntax-only ; \ done rm sanity_check.cpp xsimd_check.cpp exit 0 nocona core2 
nehalem corei7 westmere sandybridge corei7-avx ivybridge core-avx-i haswell core-avx2 broadwell skylake skylake-avx512 cannonlake icelake-client rocketlake icelake-server cascadelake tigerlake cooperlake sapphirerapids emeraldrapids alderlake raptorlake meteorlake graniterapids graniterapids-d bonnell atom silvermont slm goldmont goldmont-plus tremont gracemont sierraforest grandridge knl knm x86-64 x86-64-v2 x86-64-v3 x86-64-v4 eden-x2 nano nano-1000 nano-2000 nano-3000 nano-x2 eden-x4 nano-x4 lujiazui k8 k8-sse3 opteron opteron-sse3 athlon64 athlon64-sse3 athlon-fx amdfam10 barcelona bdver1 bdver2 bdver3 bdver4 znver1 znver2 znver3 znver4 btver1 btver2 xtensor-stack-xsimd-541558d/test/check_inline_specifier.sh000077500000000000000000000021451517435117100237130ustar00rootroot00000000000000#!/bin/sh # # Usage: $0 top_srcdir # # This script walks all headers in $top_srcdir/include and makes sure that all # functions declared there are marked as inline or constexpr (which implies # inline). This makes sure the xsimd headers does not define symbol with global # linkage, and somehow convey our itnent to have all functions in xsimd being # inlined by the compiler. set -e set -x which clang-query || { echo "missing dependency: clang-query" 1>&2 ; exit 1; } top_srcdir=$1 query_file=`mktemp -t` sed -r -n '/^####/,$ p' < $0 > $query_file log_file=`mktemp -t` clang-query --extra-arg "-std=c++14" --extra-arg="-I$top_srcdir/include" -f $query_file $top_srcdir/include/xsimd/xsimd.hpp -- | tee $log_file { grep -E '^0 matches.' 
$log_file && failed=0 ; } || failed=1 rm -f $query_file rm -f $log_file exit $failed #### clang-query commands #### set traversal IgnoreUnlessSpelledInSource set print-matcher false set bind-root false enable output diag match functionDecl(isExpansionInFileMatching(".*/xsimd/.*"), isDefinition(), unless(isInline()), unless(isConstexpr())).bind("inline-function") xtensor-stack-xsimd-541558d/test/doc/000077500000000000000000000000001517435117100174535ustar00rootroot00000000000000xtensor-stack-xsimd-541558d/test/doc/CMakeLists.txt000066400000000000000000000025511517435117100222160ustar00rootroot00000000000000# Only test under some architecture, because it's just a sanity check, no full # coverage is needed. if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT CMAKE_OSX_ARCHITECTURES) if(NOT TARGET xsimd) find_package(xsimd REQUIRED CONFIG) endif() if(ENABLE_XTL_COMPLEX) add_compile_definitions(XSIMD_ENABLE_XTL_COMPLEX=1) include_directories(${xtl_INCLUDE_DIRS}) endif() add_library(test_doc_any_arch OBJECT explicit_use_of_an_instruction_set_mean_aligned.cpp explicit_use_of_an_instruction_set_mean_arch_independent.cpp explicit_use_of_an_instruction_set_mean.cpp explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp manipulating_abstract_batches.cpp manipulating_parametric_batches.cpp writing_vectorized_code.cpp) target_link_libraries(test_doc_any_arch PRIVATE xsimd) target_compile_options(test_doc_any_arch PRIVATE -mavx) add_library(test_doc_avx2 OBJECT explicit_use_of_an_instruction_set.cpp sum_avx2.cpp) target_link_libraries(test_doc_avx2 PRIVATE xsimd) target_compile_options(test_doc_avx2 PRIVATE -mavx2) add_library(test_doc_sse2 OBJECT sum_sse2.cpp) target_link_libraries(test_doc_sse2 PRIVATE xsimd) target_compile_options(test_doc_sse2 PRIVATE -msse2) add_dependencies(xtest test_doc_any_arch test_doc_avx2 test_doc_sse2) endif() 
xtensor-stack-xsimd-541558d/test/doc/explicit_use_of_an_instruction_set.cpp000066400000000000000000000004471517435117100273370ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" #include namespace xs = xsimd; int main(int, char*[]) { xs::batch a = { 1.5, 2.5, 3.5, 4.5 }; xs::batch b = { 2.5, 3.5, 4.5, 5.5 }; auto mean = (a + b) / 2; std::cout << mean << std::endl; return 0; } xtensor-stack-xsimd-541558d/test/doc/explicit_use_of_an_instruction_set_mean.cpp000066400000000000000000000014161517435117100303340ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" #include #include void mean(const std::vector& a, const std::vector& b, std::vector& res) { using b_type = xsimd::batch; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for (std::size_t i = 0; i < vec_size; i += inc) { b_type avec = b_type::load_unaligned(&a[i]); b_type bvec = b_type::load_unaligned(&b[i]); b_type rvec = (avec + bvec) / 2; rvec.store_unaligned(&res[i]); } // Remaining part that cannot be vectorize for (std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } xtensor-stack-xsimd-541558d/test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp000066400000000000000000000014571517435117100320240ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" #include #include using vector_type = std::vector>; void mean(const vector_type& a, const vector_type& b, vector_type& res) { using b_type = xsimd::batch; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for (std::size_t i = 0; i < vec_size; i += inc) { b_type avec = b_type::load_aligned(&a[i]); b_type bvec = b_type::load_aligned(&b[i]); b_type rvec = (avec + bvec) / 2; rvec.store_aligned(&res[i]); } // Remaining part that cannot be vectorize for (std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } 
xtensor-stack-xsimd-541558d/test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp000066400000000000000000000015371517435117100337120ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" #include struct mean { template void operator()(Arch, const C& a, const C& b, C& res, Tag) { using b_type = xsimd::batch; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for (std::size_t i = 0; i < vec_size; i += inc) { b_type avec = b_type::load(&a[i], Tag()); b_type bvec = b_type::load(&b[i], Tag()); b_type rvec = (avec + bvec) / 2; xsimd::store(&res[i], rvec, Tag()); } // Remaining part that cannot be vectorize for (std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } }; xtensor-stack-xsimd-541558d/test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp000066400000000000000000000013341517435117100330450ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" #include template void mean(const C& a, const C& b, C& res, Tag) { using b_type = xsimd::batch; std::size_t inc = b_type::size; std::size_t size = res.size(); // size for which the vectorization is possible std::size_t vec_size = size - size % inc; for (std::size_t i = 0; i < vec_size; i += inc) { b_type avec = b_type::load(&a[i], Tag()); b_type bvec = b_type::load(&b[i], Tag()); b_type rvec = (avec + bvec) / 2; xsimd::store(&res[i], rvec, Tag()); } // Remaining part that cannot be vectorize for (std::size_t i = vec_size; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } xtensor-stack-xsimd-541558d/test/doc/manipulating_abstract_batches.cpp000066400000000000000000000002241517435117100262210ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" namespace xs = xsimd; xs::batch mean(xs::batch lhs, xs::batch rhs) { return (lhs + rhs) / 2; } 
xtensor-stack-xsimd-541558d/test/doc/manipulating_parametric_batches.cpp000066400000000000000000000002711517435117100265470ustar00rootroot00000000000000#include "xsimd/xsimd.hpp" namespace xs = xsimd; template xs::batch mean(xs::batch lhs, xs::batch rhs) { return (lhs + rhs) / 2; } xtensor-stack-xsimd-541558d/test/doc/sum.hpp000066400000000000000000000021231517435117100207660ustar00rootroot00000000000000#ifndef _SUM_HPP #define _SUM_HPP #include "xsimd/xsimd.hpp" // functor with a call method that depends on `Arch` struct sum { // It's critical not to use an in-class definition here. // In-class and inline definition bypass extern template mechanism. template T operator()(Arch, T const* data, unsigned size); }; template T sum::operator()(Arch, T const* data, unsigned size) { using batch = xsimd::batch; batch acc(static_cast(0)); const unsigned n = size / batch::size * batch::size; for (unsigned i = 0; i != n; i += batch::size) acc += batch::load_unaligned(data + i); T star_acc = xsimd::reduce_add(acc); for (unsigned i = n; i < size; ++i) star_acc += data[i]; return star_acc; } // Inform the compiler that sse2 and avx2 implementation are to be found in another compilation unit. 
extern template float sum::operator()(xsimd::avx2, float const*, unsigned); extern template float sum::operator()(xsimd::sse2, float const*, unsigned); #endif xtensor-stack-xsimd-541558d/test/doc/sum_avx2.cpp000066400000000000000000000002031517435117100217160ustar00rootroot00000000000000// compile with -mavx2 #include "sum.hpp" template float sum::operator()(xsimd::avx2, float const*, unsigned); xtensor-stack-xsimd-541558d/test/doc/sum_sse2.cpp000066400000000000000000000002031517435117100217120ustar00rootroot00000000000000// compile with -msse2 #include "sum.hpp" template float sum::operator()(xsimd::sse2, float const*, unsigned); xtensor-stack-xsimd-541558d/test/doc/writing_vectorized_code.cpp000066400000000000000000000004101517435117100250650ustar00rootroot00000000000000#include #include void mean(const std::vector& a, const std::vector& b, std::vector& res) { std::size_t size = res.size(); for (std::size_t i = 0; i < size; ++i) { res[i] = (a[i] + b[i]) / 2; } } xtensor-stack-xsimd-541558d/test/main.cpp000066400000000000000000000021641517435117100203410ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef EMSCRIPTEN #define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN #include "doctest/doctest.h" #else #define DOCTEST_CONFIG_IMPLEMENT #include "doctest/doctest.h" #include int run_tests() { doctest::Context context; return context.run(); } EMSCRIPTEN_BINDINGS(my_module) { emscripten::function("run_tests", &run_tests); } #endifxtensor-stack-xsimd-541558d/test/sse.sh000066400000000000000000000042541517435117100200410ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm_[a-z1Z0-9_]+' $0` do if ! grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd #_mm_add_ss _mm_and_ps _mm_andnot_ps #_mm_avg_pu16 #_mm_avg_pu8 _mm_cmpeq_ps #_mm_cmpeq_ss _mm_cmpge_ps #_mm_cmpge_ss _mm_cmpgt_ps #_mm_cmpgt_ss _mm_cmple_ps #_mm_cmple_ss _mm_cmplt_ps #_mm_cmplt_ss #_mm_cmpneq_ps #_mm_cmpneq_ss #_mm_cmpnge_ps #_mm_cmpnge_ss #_mm_cmpngt_ps #_mm_cmpngt_ss #_mm_cmpnle_ps #_mm_cmpnle_ss #_mm_cmpnlt_ps #_mm_cmpnlt_ss #_mm_cmpord_ps #_mm_cmpord_ss _mm_cmpunord_ps #_mm_cmpunord_ss #_mm_comieq_ss #_mm_comige_ss #_mm_comigt_ss #_mm_comile_ss #_mm_comilt_ss #_mm_comineq_ss #_mm_cvt_pi2ps #_mm_cvt_ps2pi #_mm_cvt_si2ss #_mm_cvt_ss2si #_mm_cvtpi16_ps #_mm_cvtpi32_ps #_mm_cvtpi32x2_ps #_mm_cvtpi8_ps #_mm_cvtps_pi16 #_mm_cvtps_pi32 #_mm_cvtps_pi8 #_mm_cvtpu16_ps #_mm_cvtpu8_ps #_mm_cvtsi32_ss #_mm_cvtsi64_ss #_mm_cvtss_f32 #_mm_cvtss_si32 #_mm_cvtss_si64 #_mm_cvtt_ps2pi #_mm_cvtt_ss2si #_mm_cvttps_pi32 #_mm_cvttss_si32 #_mm_cvttss_si64 _mm_div_ps #_mm_div_ss #_mm_extract_pi16 #_mm_free #_mm_getcsr #_mm_insert_pi16 _mm_load_ps #_mm_load_ps1 #_mm_load_ss #_mm_load1_ps #_mm_loadh_pi #_mm_loadl_pi #_mm_loadr_ps _mm_loadu_ps #_mm_loadu_si16 #_mm_loadu_si64 #_mm_malloc #_mm_maskmove_si64 #_mm_max_pi16 _mm_max_ps #_mm_max_pu8 #_mm_max_ss #_mm_min_pi16 _mm_min_ps #_mm_min_pu8 #_mm_min_ss #_mm_move_ss 
#_mm_movehl_ps #_mm_movelh_ps #_mm_movemask_pi8 #_mm_movemask_ps _mm_mul_ps #_mm_mul_ss #_mm_mulhi_pu16 _mm_or_ps #_mm_prefetch #_mm_rcp_ps #_mm_rcp_ss #_mm_rsqrt_ps #_mm_rsqrt_ss #_mm_sad_pu8 #_mm_set_ps #_mm_set_ps1 #_mm_set_ss #_mm_set1_ps #_mm_setcsr _mm_setr_ps #_mm_setzero_ps #_mm_sfence #_mm_shuffle_pi16 #_mm_shuffle_ps _mm_sqrt_ps #_mm_sqrt_ss _mm_store_ps #_mm_store_ps1 #_mm_store_ss #_mm_store1_ps #_mm_storeh_pi #_mm_storel_pi #_mm_storer_ps _mm_storeu_ps #_mm_storeu_si16 #_mm_storeu_si64 #_mm_stream_pi #_mm_stream_ps _mm_sub_ps #_mm_sub_ss #_mm_ucomieq_ss #_mm_ucomige_ss #_mm_ucomigt_ss #_mm_ucomile_ss #_mm_ucomilt_ss #_mm_ucomineq_ss #_mm_undefined_ps _mm_unpackhi_ps _mm_unpacklo_ps _mm_xor_ps xtensor-stack-xsimd-541558d/test/sse2.sh000066400000000000000000000073721517435117100201270ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm_[a-z1Z0-9_]+' $0` do if ! grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd _mm_add_epi16 _mm_add_epi32 _mm_add_epi64 _mm_add_epi8 _mm_add_pd _mm_add_sd #_mm_add_si64 _mm_adds_epi16 _mm_adds_epi8 _mm_adds_epu16 _mm_adds_epu8 _mm_and_pd _mm_and_si128 _mm_andnot_pd _mm_andnot_si128 #_mm_avg_epu16 #_mm_avg_epu8 #_mm_bslli_si128 #_mm_bsrli_si128 _mm_castpd_ps _mm_castpd_si128 _mm_castps_pd _mm_castps_si128 _mm_castsi128_pd _mm_castsi128_ps #_mm_clflush _mm_cmpeq_epi16 _mm_cmpeq_epi32 _mm_cmpeq_epi8 _mm_cmpeq_pd #_mm_cmpeq_sd _mm_cmpge_pd #_mm_cmpge_sd _mm_cmpgt_epi16 _mm_cmpgt_epi32 _mm_cmpgt_epi8 _mm_cmpgt_pd #_mm_cmpgt_sd _mm_cmple_pd #_mm_cmple_sd _mm_cmplt_epi16 _mm_cmplt_epi32 _mm_cmplt_epi8 _mm_cmplt_pd #_mm_cmplt_sd _mm_cmpneq_pd #_mm_cmpneq_sd #_mm_cmpnge_pd #_mm_cmpnge_sd #_mm_cmpngt_pd #_mm_cmpngt_sd #_mm_cmpnle_pd #_mm_cmpnle_sd #_mm_cmpnlt_pd #_mm_cmpnlt_sd #_mm_cmpord_pd #_mm_cmpord_sd _mm_cmpunord_pd #_mm_cmpunord_sd #_mm_comieq_sd #_mm_comige_sd #_mm_comigt_sd 
#_mm_comile_sd #_mm_comilt_sd #_mm_comineq_sd #_mm_cvtepi32_pd _mm_cvtepi32_ps #_mm_cvtpd_epi32 #_mm_cvtpd_pi32 #_mm_cvtpd_ps #_mm_cvtpi32_pd _mm_cvtps_epi32 #_mm_cvtps_pd #_mm_cvtsd_f64 #_mm_cvtsd_si32 #_mm_cvtsd_si64 #_mm_cvtsd_si64x #_mm_cvtsd_ss #_mm_cvtsi128_si32 #_mm_cvtsi128_si64 #_mm_cvtsi128_si64x #_mm_cvtsi32_sd #_mm_cvtsi32_si128 #_mm_cvtsi64_sd #_mm_cvtsi64_si128 #_mm_cvtsi64x_sd #_mm_cvtsi64x_si128 #_mm_cvtss_sd #_mm_cvttpd_epi32 #_mm_cvttpd_pi32 _mm_cvttps_epi32 #_mm_cvttsd_si32 #_mm_cvttsd_si64 #_mm_cvttsd_si64x _mm_div_pd #_mm_div_sd #_mm_extract_epi16 #_mm_insert_epi16 #_mm_lfence _mm_load_pd #_mm_load_pd1 #_mm_load_sd _mm_load_si128 #_mm_load1_pd #_mm_loadh_pd #_mm_loadl_epi64 #_mm_loadl_pd #_mm_loadr_pd _mm_loadu_pd _mm_loadu_si128 #_mm_loadu_si32 #_mm_madd_epi16 #_mm_maskmoveu_si128 _mm_max_epi16 _mm_max_epu8 _mm_max_pd #_mm_max_sd #_mm_mfence _mm_min_epi16 _mm_min_epu8 _mm_min_pd #_mm_min_sd #_mm_move_epi64 #_mm_move_sd #_mm_movemask_epi8 #_mm_movemask_pd #_mm_movepi64_pi64 #_mm_movpi64_epi64 _mm_mul_epu32 _mm_mul_pd #_mm_mul_sd #_mm_mul_su32 #_mm_mulhi_epi16 #_mm_mulhi_epu16 _mm_mullo_epi16 _mm_or_pd _mm_or_si128 #_mm_packs_epi16 #_mm_packs_epi32 #_mm_packus_epi16 #_mm_pause #_mm_sad_epu8 #_mm_set_epi16 #_mm_set_epi32 #_mm_set_epi64 #_mm_set_epi64x #_mm_set_epi8 #_mm_set_pd #_mm_set_pd1 #_mm_set_sd _mm_set1_epi16 _mm_set1_epi32 _mm_set1_epi64 _mm_set1_epi64x _mm_set1_epi8 _mm_set1_pd _mm_setr_epi16 _mm_setr_epi32 #_mm_setr_epi64 _mm_setr_epi8 _mm_setr_pd #_mm_setzero_pd _mm_setzero_si128 _mm_shuffle_epi32 _mm_shuffle_pd #_mm_shufflehi_epi16 #_mm_shufflelo_epi16 #_mm_sll_epi16 #_mm_sll_epi32 #_mm_sll_epi64 _mm_slli_epi16 _mm_slli_epi32 _mm_slli_epi64 #_mm_slli_si128 _mm_sqrt_pd #_mm_sqrt_sd #_mm_sra_epi16 #_mm_sra_epi32 _mm_srai_epi16 _mm_srai_epi32 #_mm_srl_epi16 #_mm_srl_epi32 #_mm_srl_epi64 _mm_srli_epi16 _mm_srli_epi32 _mm_srli_epi64 #_mm_srli_si128 _mm_store_pd #_mm_store_pd1 #_mm_store_sd _mm_store_si128 #_mm_store1_pd #_mm_storeh_pd 
#_mm_storel_epi64 #_mm_storel_pd #_mm_storer_pd _mm_storeu_pd _mm_storeu_si128 #_mm_storeu_si32 #_mm_stream_pd #_mm_stream_si128 #_mm_stream_si32 #_mm_stream_si64 _mm_sub_epi16 _mm_sub_epi32 _mm_sub_epi64 _mm_sub_epi8 _mm_sub_pd #_mm_sub_sd #_mm_sub_si64 _mm_subs_epi16 _mm_subs_epi8 _mm_subs_epu16 _mm_subs_epu8 #_mm_ucomieq_sd #_mm_ucomige_sd #_mm_ucomigt_sd #_mm_ucomile_sd #_mm_ucomilt_sd #_mm_ucomineq_sd #_mm_undefined_pd #_mm_undefined_si128 _mm_unpackhi_epi16 _mm_unpackhi_epi32 _mm_unpackhi_epi64 _mm_unpackhi_epi8 _mm_unpackhi_pd _mm_unpacklo_epi16 _mm_unpacklo_epi32 _mm_unpacklo_epi64 _mm_unpacklo_epi8 _mm_unpacklo_pd _mm_xor_pd _mm_xor_si128 xtensor-stack-xsimd-541558d/test/sse3.sh000066400000000000000000000006641517435117100201250ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm_[a-z1Z0-9_]+' $0` do if ! grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd #_mm_addsub_pd #_mm_addsub_ps _mm_hadd_pd _mm_hadd_ps #_mm_hsub_pd #_mm_hsub_ps #_mm_lddqu_si128 #_mm_loaddup_pd #_mm_movedup_pd #_mm_movehdup_ps #_mm_moveldup_ps xtensor-stack-xsimd-541558d/test/sse4_1.sh000066400000000000000000000023601517435117100203410ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm_[a-z1Z0-9_]+' $0` do if ! 
grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd _mm_blend_epi16 _mm_blend_pd _mm_blend_ps _mm_blendv_epi8 _mm_blendv_pd _mm_blendv_ps _mm_ceil_pd _mm_ceil_ps #_mm_ceil_sd #_mm_ceil_ss _mm_cmpeq_epi64 #_mm_cvtepi16_epi32 #_mm_cvtepi16_epi64 #_mm_cvtepi32_epi64 #_mm_cvtepi8_epi16 #_mm_cvtepi8_epi32 #_mm_cvtepi8_epi64 #_mm_cvtepu16_epi32 #_mm_cvtepu16_epi64 #_mm_cvtepu32_epi64 #_mm_cvtepu8_epi16 #_mm_cvtepu8_epi32 #_mm_cvtepu8_epi64 #_mm_dp_pd #_mm_dp_ps #_mm_extract_epi32 #_mm_extract_epi64 #_mm_extract_epi8 #_mm_extract_ps _mm_floor_pd _mm_floor_ps #_mm_floor_sd #_mm_floor_ss #_mm_insert_epi32 #_mm_insert_epi64 #_mm_insert_epi8 #_mm_insert_ps _mm_max_epi32 _mm_max_epi8 _mm_max_epu16 _mm_max_epu32 _mm_min_epi32 _mm_min_epi8 _mm_min_epu16 _mm_min_epu32 #_mm_minpos_epu16 #_mm_mpsadbw_epu8 #_mm_mul_epi32 _mm_mullo_epi32 #_mm_packus_epi32 _mm_round_pd _mm_round_ps #_mm_round_sd #_mm_round_ss #_mm_stream_load_si128 #_mm_test_all_ones #_mm_test_all_zeros #_mm_test_mix_ones_zeros #_mm_testc_si128 #_mm_testnzc_si128 #_mm_testz_si128 xtensor-stack-xsimd-541558d/test/sse4_2.sh000066400000000000000000000010401517435117100203340ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm_[a-z1Z0-9_]+' $0` do if ! grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd #_mm_cmpestra #_mm_cmpestrc #_mm_cmpestri #_mm_cmpestrm #_mm_cmpestro #_mm_cmpestrs #_mm_cmpestrz _mm_cmpgt_epi64 #_mm_cmpistra #_mm_cmpistrc #_mm_cmpistri #_mm_cmpistrm #_mm_cmpistro #_mm_cmpistrs #_mm_cmpistrz #_mm_crc32_u16 #_mm_crc32_u32 #_mm_crc32_u64 #_mm_crc32_u8 xtensor-stack-xsimd-541558d/test/ssse3.sh000066400000000000000000000014061517435117100203030ustar00rootroot00000000000000#!/bin/sh exit_code=0 for instr in `grep -o -E '^_mm_[a-z1Z0-9_]+' $0` do if ! 
grep -q -r $instr ../include-refactoring then echo $instr exit_code=1 fi done exit $exit_code # Instructions below starting with a # are known to be unused in xsimd _mm_abs_epi16 _mm_abs_epi32 _mm_abs_epi8 #_mm_abs_pi16 #_mm_abs_pi32 #_mm_abs_pi8 #_mm_alignr_epi8 #_mm_alignr_pi8 _mm_hadd_epi16 _mm_hadd_epi32 #_mm_hadd_pi16 #_mm_hadd_pi32 #_mm_hadds_epi16 #_mm_hadds_pi16 #_mm_hsub_epi16 #_mm_hsub_epi32 #_mm_hsub_pi16 #_mm_hsub_pi32 #_mm_hsubs_epi16 #_mm_hsubs_pi16 #_mm_maddubs_epi16 #_mm_maddubs_pi16 #_mm_mulhrs_epi16 #_mm_mulhrs_pi16 #_mm_shuffle_epi8 #_mm_shuffle_pi8 #_mm_sign_epi16 #_mm_sign_epi32 #_mm_sign_epi8 #_mm_sign_pi16 #_mm_sign_pi32 #_mm_sign_pi8 xtensor-stack-xsimd-541558d/test/test_api.cpp000066400000000000000000000166251517435117100212340ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include #include #include #include "test_utils.hpp" template struct xsimd_api_test { using batch_type = B; using batch_bool_type = typename B::batch_bool_type; using arch_type = typename B::arch_type; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using int8_vector_type = std::vector>; using uint8_vector_type = std::vector>; using int16_vector_type = std::vector>; using uint16_vector_type = std::vector>; using int32_vector_type = std::vector>; using uint32_vector_type = std::vector>; using int64_vector_type = std::vector>; using uint64_vector_type = std::vector>; using float_vector_type = std::vector>; using double_vector_type = std::vector>; int8_vector_type i8_vec; uint8_vector_type ui8_vec; int16_vector_type i16_vec; uint16_vector_type ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; int64_vector_type i64_vec; uint64_vector_type ui64_vec; float_vector_type f_vec; double_vector_type d_vec; array_type expected; xsimd_api_test() { init_test_vector(i8_vec); init_test_vector(ui8_vec); init_test_vector(i16_vec); init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); init_test_vector(i64_vec); init_test_vector(ui64_vec); init_test_vector(f_vec); #if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON init_test_vector(d_vec); #endif } void test_load() { test_load_impl(i8_vec, "load int8_t"); test_load_impl(ui8_vec, "load uint8_t"); test_load_impl(i16_vec, "load int16_t"); test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); test_load_impl(f_vec, "load float"); #if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON test_load_impl(d_vec, "load double"); #endif } void test_store() { 
test_store_impl(i8_vec, "store int8_t"); test_store_impl(ui8_vec, "store uint8_t"); test_store_impl(i16_vec, "store int16_t"); test_store_impl(ui16_vec, "store uint16_t"); test_store_impl(i32_vec, "store int32_t"); test_store_impl(ui32_vec, "store uint32_t"); test_store_impl(i64_vec, "store int64_t"); test_store_impl(ui64_vec, "store uint64_t"); test_store_impl(f_vec, "store float"); #if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON test_store_impl(d_vec, "store double"); #endif } void test_set() { test_set_bool("set bool"); test_set_impl("set int8_t"); test_set_impl("set uint8_t"); test_set_impl("set int16_t"); test_set_impl("set uint16_t"); test_set_impl("set int32_t"); test_set_impl("set uint32_t"); test_set_impl("set int64_t"); test_set_impl("set uint64_t"); test_set_impl("set float"); #if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON test_set_impl("set double"); #endif } private: template void test_load_impl(const V& v, const std::string& name) { batch_type b; std::copy(v.cbegin(), v.cend(), expected.begin()); b = batch_type::load(v.data(), xsimd::unaligned_mode()); INFO(name, " unaligned"); CHECK_BATCH_EQ(b, expected); b = batch_type::load(v.data(), xsimd::aligned_mode()); INFO(name, " aligned"); CHECK_BATCH_EQ(b, expected); } template void test_store_impl(const V& v, const std::string& name) { batch_type b = batch_type::load(v.data(), xsimd::aligned_mode()); V res(size); alignas(arch_type::alignment()) bool b_data[size]; xsimd::store_as(res.data(), b, xsimd::unaligned_mode()); INFO(name, " unaligned"); CHECK_VECTOR_EQ(res, v); std::fill(b_data, b_data + size, false); batch_bool_type bb = (b == b); xsimd::store_as(b_data, bb, xsimd::unaligned_mode()); INFO(name, " batch_bool unaligned"); CHECK_UNARY(std::accumulate(b_data, b_data + size, true, std::logical_and())); xsimd::store_as(res.data(), b, xsimd::aligned_mode()); INFO(name, " aligned"); CHECK_VECTOR_EQ(res, v); std::fill(b_data, b_data + size, false); bb = (b == b); xsimd::store_as(b_data, bb, xsimd::aligned_mode()); 
INFO(name, " batch_bool aligned"); CHECK_UNARY(std::accumulate(b_data, b_data + size, true, std::logical_and())); } template void test_set_impl(const std::string& name) { #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wshorten-64-to-32" #endif T v = T(1); batch_type expected(v); batch_type res = xsimd::broadcast(v); #ifdef __clang__ #pragma clang diagnostic pop #endif INFO(name); CHECK_BATCH_EQ(res, expected); } void test_set_bool(const std::string& name) { bool v = true; xsimd::batch_bool expected(v); xsimd::batch_bool res = xsimd::broadcast(v); INFO(name); CHECK_BATCH_EQ(res, expected); } template void init_test_vector(V& vec) { vec.resize(size); int min = 0; int max = 100; std::default_random_engine generator; std::uniform_int_distribution distribution(min, max); auto gen = [&distribution, &generator]() { return static_cast(distribution(generator)); }; std::generate(vec.begin(), vec.end(), gen); } }; TEST_CASE_TEMPLATE("[basic api]", B, BATCH_TYPES) { xsimd_api_test Test; SUBCASE("load") { Test.test_load(); } SUBCASE("store") { Test.test_store(); } SUBCASE("set") { Test.test_set(); } } #endif xtensor-stack-xsimd-541558d/test/test_arch.cpp000066400000000000000000000203321517435117100213660ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include #include #include "test_sum.hpp" #include "test_utils.hpp" #ifndef XSIMD_DEFAULT_ARCH static_assert(xsimd::default_arch::supported(), "default arch must be supported"); static_assert(std::is_same::value, "default arch is the best available"); static_assert(xsimd::supported_architectures::contains(), "default arch is supported"); static_assert(xsimd::all_architectures::contains(), "default arch is a valid arch"); #endif #if !XSIMD_WITH_SVE static_assert((std::is_base_of::value || !xsimd::neon64::supported()), "on arm, without sve, the best we can do is neon64"); #endif struct check_supported { template void operator()(Arch) const { static_assert(Arch::supported(), "not supported?"); } }; struct check_cpu_has_intruction_set { template void operator()(Arch arch) const { static_assert(std::is_same::value, "cannot test instruction set availability on CPU"); } }; struct check_available { template void operator()(Arch) const { CHECK_UNARY(Arch::available()); } }; template static bool try_load() { static_assert(std::is_same, decltype(xsimd::load_aligned(std::declval()))>::value, "loading the expected type"); static_assert(std::is_same, decltype(xsimd::load_unaligned(std::declval()))>::value, "loading the expected type"); return true; } template void try_loads() { (void)std::initializer_list { try_load()... 
}; } TEST_CASE("[multi arch support]") { SUBCASE("xsimd::supported_architectures") { xsimd::supported_architectures::for_each(check_supported {}); } SUBCASE("xsimd::available_architectures::has") { xsimd::all_architectures::for_each(check_cpu_has_intruction_set {}); } SUBCASE("xsimd::default_arch::name") { constexpr char const* name = xsimd::default_arch::name(); (void)name; } SUBCASE("xsimd::default_arch::available") { CHECK_UNARY(xsimd::default_arch::available()); } SUBCASE("xsimd::arch_list<...>::alignment()") { static_assert(xsimd::arch_list::alignment() == 0, "common"); static_assert(xsimd::arch_list::alignment() == xsimd::sse2::alignment(), "one architecture"); static_assert(xsimd::arch_list::alignment() == xsimd::avx512f::alignment(), "two architectures"); } SUBCASE("xsimd::dispatch(...)") { float data[17] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f }; float ref = std::accumulate(std::begin(data), std::end(data), 0.f); // platform specific { auto dispatched = xsimd::dispatch(sum {}); float res = dispatched(data, 17); CHECK_EQ(ref, res); } // only highest available { auto dispatched = xsimd::dispatch>(sum {}); float res = dispatched(data, 17); CHECK_EQ(ref, res); } #if XSIMD_WITH_AVX && XSIMD_WITH_SSE2 static_assert(xsimd::supported_architectures::contains() && xsimd::supported_architectures::contains(), "consistent supported architectures"); { auto dispatched = xsimd::dispatch>(sum {}); float res = dispatched(data, 17); CHECK_EQ(ref, res); } #endif } SUBCASE("xsimd::make_sized_batch_t") { using batch4f = xsimd::make_sized_batch_t; using batch2d = xsimd::make_sized_batch_t; using batch4c = xsimd::make_sized_batch_t, 4>; using batch2z = xsimd::make_sized_batch_t, 2>; using batch4i32 = xsimd::make_sized_batch_t; using batch4u32 = xsimd::make_sized_batch_t; using batch8f = xsimd::make_sized_batch_t; using batch4d = xsimd::make_sized_batch_t; using batch8c = xsimd::make_sized_batch_t, 8>; using batch4z = 
xsimd::make_sized_batch_t, 4>; using batch8i32 = xsimd::make_sized_batch_t; using batch8u32 = xsimd::make_sized_batch_t; #if XSIMD_WITH_SSE2 || XSIMD_WITH_NEON || XSIMD_WITH_NEON64 || XSIMD_WITH_SVE || (XSIMD_WITH_RVV && XSIMD_RVV_BITS == 128) CHECK_EQ(4, size_t(batch4f::size)); CHECK_EQ(4, size_t(batch4c::size)); CHECK_EQ(4, size_t(batch4i32::size)); CHECK_EQ(4, size_t(batch4u32::size)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same, batch4c::value_type>::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); #if XSIMD_WITH_SSE2 || XSIMD_WITH_NEON64 || XSIMD_WITH_SVE || XSIMD_WITH_RVV CHECK_EQ(2, size_t(batch2d::size)); CHECK_EQ(2, size_t(batch2z::size)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same, batch2z::value_type>::value)); #else CHECK_UNARY(bool(std::is_same::value)); #endif #endif #if !XSIMD_WITH_AVX && !XSIMD_WITH_FMA3 && !(XSIMD_WITH_SVE && XSIMD_SVE_BITS == 256) && !(XSIMD_WITH_RVV && XSIMD_RVV_BITS == 256) CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); #else CHECK_EQ(8, size_t(batch8f::size)); CHECK_EQ(8, size_t(batch8i32::size)); CHECK_EQ(8, size_t(batch8u32::size)); CHECK_EQ(4, size_t(batch4d::size)); CHECK_EQ(8, size_t(batch8c::size)); CHECK_EQ(4, size_t(batch4z::size)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same::value)); CHECK_UNARY(bool(std::is_same, batch8c::value_type>::value)); CHECK_UNARY(bool(std::is_same, batch4z::value_type>::value)); #endif } SUBCASE("xsimd::load_(un)aligned(...) return type") { // make sure load_aligned / load_unaligned work for the default arch and // return the appropriate type. 
try_loads #if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON , double, std::complex #endif >(); } } #endif xtensor-stack-xsimd-541558d/test/test_basic_math.cpp000066400000000000000000000133251517435117100225470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" #ifndef __FAST_MATH__ namespace detail { template ::value> struct infinity_tester { static void test_isfinite() { T input(1); INFO("input: ", input); CHECK_UNARY(xsimd::all(xsimd::isfinite(input))); } static void test_isinf() { T input(1); INFO("input: ", input); CHECK_FALSE(xsimd::any(xsimd::isinf(input))); } }; template struct infinity_tester { static void test_isfinite() { T input = xsimd::infinity(); CHECK_FALSE(xsimd::any(xsimd::isfinite(input))); } static void test_isinf() { T input = xsimd::infinity(); CHECK_UNARY(xsimd::all(xsimd::isinf(input))); } }; } #endif template struct basic_math_test { using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; array_type lhs; array_type rhs; array_type clip_input; array_type from_input; basic_math_test() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)) + value_type(1.); rhs[i] = value_type(10.2) / (i + 2) + value_type(0.25) + value_type(1.); clip_input[i] = static_cast(i) * value_type(0.25); from_input[i] = rhs[i] - value_type(1); } } void test_fmod() const { array_type expected; std::transform( lhs.cbegin(), 
lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fmod(l, r); }); batch_type res = xsimd::fmod(batch_lhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } void test_remainder() const { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::remainder(l, r); }); batch_type res = xsimd::remainder(batch_lhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } void test_fdim() const { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fdim(l, r); }); batch_type res = xsimd::fdim(batch_lhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } void test_clip() { value_type clip_lo = static_cast(0.5); value_type clip_hi = static_cast(1.); array_type expected; std::transform(clip_input.cbegin(), clip_input.cend(), expected.begin(), [clip_lo, clip_hi](const value_type& l) { return l < clip_lo ? clip_lo : clip_hi < l ? 
clip_hi : l; }); batch_type res = xsimd::clip(batch_clip_input(), batch_type(clip_lo), batch_type(clip_hi)); CHECK_BATCH_EQ(res, expected); } #ifndef __FAST_MATH__ void test_isfinite() { detail::infinity_tester::test_isfinite(); } void test_isinf() { detail::infinity_tester::test_isinf(); } #endif void test_nextafter() { array_type expected; std::transform(from_input.cbegin(), from_input.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::nextafter(l, r); }); batch_type res = xsimd::nextafter(batch_from_input(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } private: batch_type batch_lhs() const { return batch_type::load_unaligned(lhs.data()); } batch_type batch_rhs() const { return batch_type::load_unaligned(rhs.data()); } batch_type batch_clip_input() const { return batch_type::load_unaligned(clip_input.data()); } batch_type batch_from_input() const { return batch_type::load_unaligned(from_input.data()); } }; TEST_CASE_TEMPLATE("[basic math tests]", B, BATCH_MATH_TYPES) { basic_math_test Test; SUBCASE("fmod") { Test.test_fmod(); } SUBCASE("remainder") { Test.test_remainder(); } SUBCASE("fdim") { Test.test_fdim(); } SUBCASE("clip") { Test.test_clip(); } #ifndef __FAST_MATH__ SUBCASE("isfinite") { Test.test_isfinite(); } SUBCASE("isinf") { Test.test_isinf(); } #endif SUBCASE("nextafter") { Test.test_nextafter(); } } #endif xtensor-stack-xsimd-541558d/test/test_batch.cpp000066400000000000000000001102431517435117100215330ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include #include #include #include #include "test_utils.hpp" using namespace std::placeholders; template struct batch_test { using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; value_type scalar; batch_test() { init_operands(); } void test_stream_dump() const { array_type res; batch_type b = batch_type::load_unaligned(lhs.data()); b.store_unaligned(res.data()); std::ostringstream b_dump; b_dump << b; std::ostringstream res_dump; res_dump << '('; for (std::size_t i = 0; i < res.size() - 1; ++i) res_dump << res[i] << ", "; res_dump << res.back() << ')'; CHECK_EQ(res_dump.str(), b_dump.str()); } void test_load_store() const { array_type res; batch_type b = batch_type::load_unaligned(lhs.data()); b.store_unaligned(res.data()); INFO("load_unaligned / store_unaligned"); CHECK_EQ(res, lhs); alignas(xsimd::default_arch::alignment()) array_type arhs(this->rhs); alignas(xsimd::default_arch::alignment()) array_type ares; b = batch_type::load_aligned(arhs.data()); b.store_aligned(ares.data()); INFO("load_aligned / store_aligned"); CHECK_EQ(ares, rhs); } template struct pack { }; template void check_constructor_from_sequence(std::integral_constant, pack) const { array_type tmp = { static_cast(Values)... }; batch_type b0(static_cast(Values)...); INFO("batch(values...)"); CHECK_EQ(b0, tmp); batch_type b1 { static_cast(Values)... 
}; INFO("batch{values...}"); CHECK_EQ(b0, tmp); } template void check_constructor_from_sequence(std::integral_constant, pack) const { return check_constructor_from_sequence(std::integral_constant(), pack()); } void test_constructors() const { batch_type b; // value initialized to random data, can't be checked (void)b; array_type tmp; std::fill(tmp.begin(), tmp.end(), value_type(2)); batch_type b0a(2); INFO("batch(value_type)"); CHECK_EQ(b0a, tmp); batch_type b0b { 2 }; INFO("batch{value_type}"); CHECK_EQ(b0b, tmp); check_constructor_from_sequence(std::integral_constant(), pack<>()); } void test_static_builders() const { { array_type expected; std::fill(expected.begin(), expected.end(), value_type(2)); auto res = batch_type::broadcast(value_type(2)); INFO("batch::broadcast"); CHECK_EQ(res, expected); } { array_type res; auto b = batch_type::load_unaligned(lhs.data()); b.store_unaligned(res.data()); INFO("batch::load_unaligned"); CHECK_EQ(res, lhs); } { alignas(xsimd::default_arch::alignment()) array_type arhs(this->rhs); alignas(xsimd::default_arch::alignment()) array_type ares; auto b = batch_type::load_aligned(arhs.data()); b.store_aligned(ares.data()); INFO("batch::load_aligned"); CHECK_EQ(ares, rhs); } } void test_access_operator() const { batch_type res = batch_lhs(); for (size_t i = 0; i < size; ++i) { CHECK_EQ(res.get(i), lhs[i]); } } void test_first_element() const { batch_type res = batch_lhs(); CHECK_EQ(res.first(), lhs[0]); } template void test_get_impl(batch_type const& res, std::index_sequence) const { array_type extracted = { xsimd::get(res)... 
}; CHECK_EQ(extracted, lhs); CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res); } void test_get() const { batch_type res = batch_lhs(); CHECK_EQ(xsimd::get<0>(res), res.first()); test_get_impl(res, std::make_index_sequence {}); } void test_arithmetic() const { // +batch { array_type expected = lhs; batch_type res = +batch_lhs(); INFO("+batch"); CHECK_BATCH_EQ(res, expected); } // -batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::negate()); batch_type res = -batch_lhs(); INFO("-batch"); CHECK_BATCH_EQ(res, expected); } // batch + batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs() + batch_rhs(); INFO("batch + batch"); CHECK_BATCH_EQ(res, expected); } // batch + scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type lres = batch_lhs() + scalar; INFO("batch + scalar"); CHECK_BATCH_EQ(lres, expected); batch_type rres = scalar + batch_lhs(); INFO("scalar + batch"); CHECK_BATCH_EQ(rres, expected); } // batch - batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs() - batch_rhs(); INFO("batch - batch"); CHECK_BATCH_EQ(res, expected); } // batch - scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type lres = batch_lhs() - scalar; INFO("batch - scalar"); CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), scalar, _1)); batch_type rres = scalar - batch_lhs(); INFO("scalar - batch"); CHECK_BATCH_EQ(rres, expected); } // batch * batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs() * batch_rhs(); INFO("batch * batch"); 
CHECK_BATCH_EQ(res, expected); } // batch * scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type lres = batch_lhs() * scalar; INFO("batch * scalar"); CHECK_BATCH_EQ(lres, expected); batch_type rres = scalar * batch_lhs(); INFO("scalar * batch"); CHECK_BATCH_EQ(rres, expected); } // batch / batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs() / batch_rhs(); INFO("batch / batch"); CHECK_BATCH_EQ(res, expected); } // batch / scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type lres = batch_lhs() / scalar; INFO("batch / scalar"); CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), scalar, _1)); batch_type rres = scalar / batch_lhs(); INFO("scalar / batch"); CHECK_BATCH_EQ(rres, expected); } } void test_incr_decr() const { // incr { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), xsimd::incr); batch_type res = xsimd::incr(batch_lhs()); INFO("incr(batch)"); CHECK_BATCH_EQ(res, expected); } // incr_if { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](value_type v) { return v > 1 ? v + 1 : v; }); batch_type res = xsimd::incr_if(batch_lhs(), batch_lhs() > value_type(1)); INFO("incr_if(batch)"); CHECK_BATCH_EQ(res, expected); } // decr { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), xsimd::decr); batch_type res = xsimd::decr(batch_lhs()); INFO("decr(batch)"); CHECK_BATCH_EQ(res, expected); } // decr_if { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](value_type v) { return v > 1 ? 
v - 1 : v; }); batch_type res = xsimd::decr_if(batch_lhs(), batch_lhs() > value_type(1)); INFO("decr_if(batch)"); CHECK_BATCH_EQ(res, expected); } } void test_saturated_arithmetic() const { // batch + batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), xsimd::sadd); batch_type res = xsimd::sadd(batch_lhs(), batch_rhs()); INFO("sadd(batch, batch)"); CHECK_BATCH_EQ(res, expected); } #if 0 // batch + scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](value_type x) { return xsimd::sadd(x, scalar); }); batch_type lres = xsimd::sadd(batch_lhs(), scalar); INFO("sadd(batch, scalar)"); CHECK_BATCH_EQ(lres, expected); batch_type rres = xsimd::sadd(scalar, batch_lhs()); INFO("sadd(scalar, batch)"); CHECK_BATCH_EQ(rres, expected); } #endif // batch - batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](value_type x, value_type y) { return xsimd::ssub(x, y); }); batch_type res = xsimd::ssub(batch_lhs(), batch_rhs()); INFO("ssub(batch, batch)"); CHECK_BATCH_EQ(res, expected); } #if 0 // batch - scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](value_type x) { return xsimd::ssub(x, scalar); }); batch_type lres = xsimd::ssub(batch_lhs(), scalar); INFO("ssub(batch, scalar)"); CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](value_type x) { return xsimd::ssub(scalar, x); }); batch_type rres = xsimd::ssub(scalar, batch_lhs()); INFO("ssub(scalar, batch)"); CHECK_BATCH_EQ(rres, expected); } #endif } void test_computed_assignment() const { // batch += batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs(); res += batch_rhs(); INFO("batch += batch"); CHECK_BATCH_EQ(res, expected); } // batch += scalar { array_type expected; std::transform(lhs.cbegin(), 
lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type res = batch_lhs(); res += scalar; INFO("batch += scalar"); CHECK_BATCH_EQ(res, expected); } // batch -= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs(); res -= batch_rhs(); INFO("batch -= batch"); CHECK_BATCH_EQ(res, expected); } // batch -= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type res = batch_lhs(); res -= scalar; INFO("batch -= scalar"); CHECK_BATCH_EQ(res, expected); } // batch *= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs(); res *= batch_rhs(); INFO("batch *= batch"); CHECK_BATCH_EQ(res, expected); } // batch *= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type res = batch_lhs(); res *= scalar; INFO("batch *= scalar"); CHECK_BATCH_EQ(res, expected); } // batch /= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs(); res /= batch_rhs(); INFO("batch /= batch"); CHECK_BATCH_EQ(res, expected); } // batch /= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type res = batch_lhs(); res /= scalar; INFO("batch /= scalar"); CHECK_BATCH_EQ(res, expected); } } void test_comparison() const { // batch == batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l == r; }); auto res = batch_lhs() == batch_rhs(); INFO("batch == batch"); CHECK_BATCH_EQ(res, expected); } // batch == scalar { bool_array_type expected; 
std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l == scalar; }); auto res = batch_lhs() == scalar; INFO("batch == scalar"); CHECK_BATCH_EQ(res, expected); } // batch != batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l != r; }); auto res = batch_lhs() != batch_rhs(); INFO("batch != batch"); CHECK_BATCH_EQ(res, expected); } // batch != scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l != scalar; }); auto res = batch_lhs() != scalar; INFO("batch != scalar"); CHECK_BATCH_EQ(res, expected); } // batch < batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l < r; }); auto res = batch_lhs() < batch_rhs(); INFO("batch < batch"); CHECK_BATCH_EQ(res, expected); std::fill(expected.begin(), expected.end(), false); res = batch_lhs() < batch_lhs(); INFO("batch < (self)"); CHECK_BATCH_EQ(res, expected); } // batch < scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l < scalar; }); auto res = batch_lhs() < scalar; INFO("batch < scalar"); CHECK_BATCH_EQ(res, expected); auto res_neg = batch_lhs() >= scalar; INFO("batch >= scalar"); CHECK_BATCH_EQ(!res_neg, expected); } // batch <= batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l <= r; }); auto res = batch_lhs() <= batch_rhs(); INFO("batch <= batch"); CHECK_BATCH_EQ(res, expected); std::fill(expected.begin(), expected.end(), true); res = batch_lhs() <= batch_lhs(); INFO("batch < (self)"); CHECK_BATCH_EQ(res, expected); } // batch <= scalar { bool_array_type expected; std::transform(lhs.cbegin(), 
lhs.cend(), expected.begin(), [this](const value_type& l) { return l <= scalar; }); auto res = batch_lhs() <= scalar; INFO("batch <= scalar"); CHECK_BATCH_EQ(res, expected); auto res_neg = batch_lhs() > scalar; INFO("batch > scalar"); CHECK_BATCH_EQ(!res_neg, expected); } // batch > batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l > r; }); auto res = batch_lhs() > batch_rhs(); INFO("batch > batch"); CHECK_BATCH_EQ(res, expected); std::fill(expected.begin(), expected.end(), false); res = batch_lhs() > batch_lhs(); INFO("batch > (self)"); CHECK_BATCH_EQ(res, expected); } // batch > scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l > scalar; }); auto res = batch_lhs() > scalar; INFO("batch > scalar"); CHECK_BATCH_EQ(res, expected); auto res_neg = batch_lhs() <= scalar; INFO("batch <= scalar"); CHECK_BATCH_EQ(!res_neg, expected); } // batch >= batch { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l >= r; }); auto res = batch_lhs() >= batch_rhs(); INFO("batch >= batch"); CHECK_BATCH_EQ(res, expected); std::fill(expected.begin(), expected.end(), true); res = batch_lhs() >= batch_lhs(); INFO("batch >= (self)"); CHECK_BATCH_EQ(res, expected); } // batch >= scalar { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [this](const value_type& l) { return l >= scalar; }); auto res = batch_lhs() >= scalar; INFO("batch >= scalar"); CHECK_BATCH_EQ(res, expected); auto res_neg = batch_lhs() < scalar; INFO("batch < scalar"); CHECK_BATCH_EQ(!res_neg, expected); } } void test_logical() const { // batch && batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::logical_and()); batch_type res = 
batch_lhs() && batch_rhs(); INFO("batch && batch"); CHECK_BATCH_EQ(res, expected); } // batch && scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::logical_and(), _1, scalar)); batch_type lres = batch_lhs() && scalar; INFO("batch && scalar"); CHECK_BATCH_EQ(lres, expected); batch_type rres = scalar && batch_lhs(); INFO("scalar && batch"); CHECK_BATCH_EQ(rres, expected); } // batch || batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::logical_or()); batch_type res = batch_lhs() || batch_rhs(); INFO("batch && batch"); CHECK_BATCH_EQ(res, expected); } // batch || scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::logical_or(), _1, scalar)); batch_type lres = batch_lhs() || scalar; INFO("batch || scalar"); CHECK_BATCH_EQ(lres, expected); batch_type rres = scalar || batch_lhs(); INFO("scalar || batch"); CHECK_BATCH_EQ(rres, expected); } } void test_min_max() const { // min { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::min(l, r); }); batch_type res = min(batch_lhs(), batch_rhs()); INFO("min"); CHECK_BATCH_EQ(res, expected); } // min limit case { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type&, const value_type& r) { return std::min(std::numeric_limits::min(), r); }); batch_type res = xsimd::min(batch_type(std::numeric_limits::min()), batch_rhs()); INFO("min limit"); CHECK_BATCH_EQ(res, expected); } // fmin { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fmin(l, r); }); batch_type res = min(batch_lhs(), batch_rhs()); INFO("fmin"); CHECK_BATCH_EQ(res, expected); } // max { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), 
rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::max(l, r); }); batch_type res = max(batch_lhs(), batch_rhs()); INFO("max"); CHECK_BATCH_EQ(res, expected); } // max limit case { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type&, const value_type& r) { return std::max(std::numeric_limits::max(), r); }); batch_type res = xsimd::max(batch_type(std::numeric_limits::max()), batch_rhs()); INFO("max limit"); CHECK_BATCH_EQ(res, expected); } // fmax { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return std::fmax(l, r); }); batch_type res = fmax(batch_lhs(), batch_rhs()); INFO("fmax"); CHECK_BATCH_EQ(res, expected); } } void test_fused_operations() const { // fma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r + r; }); // Warning: ADL seems to not work correctly on Windows, thus the full qualified call batch_type res = xsimd::fma(batch_lhs(), batch_rhs(), batch_rhs()); INFO("fma"); CHECK_BATCH_EQ(res, expected); } // fms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r - r; }); batch_type res = fms(batch_lhs(), batch_rhs(), batch_rhs()); INFO("fms"); CHECK_BATCH_EQ(res, expected); } // fnma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r + r; }); batch_type res = fnma(batch_lhs(), batch_rhs(), batch_rhs()); INFO("fnma"); CHECK_BATCH_EQ(res, expected); } // fnms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r - r; }); batch_type res = fnms(batch_lhs(), 
batch_rhs(), batch_rhs()); INFO("fnms"); CHECK_BATCH_EQ(res, expected); } // fmas { array_type expected; for (std::size_t i = 0; i < expected.size(); ++i) { // even lanes: x*y - z, odd lanes: x*y + z expected[i] = (i & 1u) == 0 ? lhs[i] * rhs[i] - rhs[i] : lhs[i] * rhs[i] + rhs[i]; } batch_type res = fmas(batch_lhs(), batch_rhs(), batch_rhs()); INFO("fmas"); CHECK_BATCH_EQ(res, expected); } } void test_abs() const { // abs { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return ::detail::uabs(l); }); batch_type res = abs(batch_lhs()); INFO("abs"); CHECK_BATCH_EQ(res, expected); } // fabs { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return std::fabs(l); }); batch_type res = fabs(batch_lhs()); INFO("fabs"); CHECK_BATCH_EQ(res, expected); } } void test_avg() const { { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) -> value_type { if (std::is_integral::value) { return static_cast(((long long)l + r) / 2); } else { return (l + r) / 2; } }); batch_type res = avg(batch_lhs(), batch_rhs()); INFO("avg"); CHECK_BATCH_EQ(res, expected); } { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) -> value_type { if (std::is_integral::value) { return static_cast(((long long)l + r) / 2 + ((long long)(l + r) & 1)); } else { return (l + r) / 2; } }); batch_type res = avgr(batch_lhs(), batch_rhs()); INFO("avgr"); CHECK_BATCH_EQ(res, expected); } } void test_horizontal_operations() const { // reduce_add { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(0)); value_type res = reduce_add(batch_lhs()); INFO("reduce_add"); CHECK_SCALAR_EQ(res, expected); } // reduce_max { value_type expected = *std::max_element(lhs.cbegin(), lhs.cend()); value_type res = 
reduce_max(batch_lhs()); INFO("reduce_max"); CHECK_SCALAR_EQ(res, expected); } // reduce_min { value_type expected = *std::min_element(lhs.cbegin(), lhs.cend()); value_type res = reduce_min(batch_lhs()); INFO("reduce_min"); CHECK_SCALAR_EQ(res, expected); } // reduce_mul { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(1), std::multiplies()); value_type res = reduce_mul(batch_lhs()); INFO("reduce_mul"); CHECK_SCALAR_EQ(res, expected); } } template std::enable_if_t<4 <= N> test_common_horizontal_operations(std::integral_constant) const { // reduce common { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(1), std::multiplies()); value_type res = reduce(xsimd::mul, batch_lhs()); INFO("common reduce"); CHECK_SCALAR_EQ(res, expected); } } void test_common_horizontal_operations(...) const { } void test_boolean_conversions() const { using batch_bool_type = typename batch_type::batch_bool_type; // batch = true { batch_bool_type tbt(true); batch_type expected = batch_type(value_type(1)); batch_type res = (batch_type)tbt; INFO("batch = true"); CHECK_BATCH_EQ(res, expected); } // batch = false { batch_bool_type fbt(false); batch_type expected = batch_type(value_type(0)); batch_type res = (batch_type)fbt; INFO("batch = false"); CHECK_BATCH_EQ(res, expected); } // !batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return !l; }); batch_type res = (batch_type)!batch_lhs(); INFO("!batch"); CHECK_BATCH_EQ(res, expected); } // bitwise_cast { batch_bool_type fbt(false); batch_type expected = batch_type(value_type(0)); batch_type res = bitwise_cast(fbt); INFO("bitwise_cast"); CHECK_BATCH_EQ(res, expected); } // bitwise not { batch_bool_type fbt(true); batch_type expected = batch_type(value_type(0)); batch_type res = ~bitwise_cast(fbt); INFO("~batch"); CHECK_BATCH_EQ(res, expected); } } private: batch_type batch_lhs() const { return 
batch_type::load_unaligned(lhs.data()); } batch_type batch_rhs() const { return batch_type::load_unaligned(rhs.data()); } void init_operands() { XSIMD_IF_CONSTEXPR(std::is_integral::value) { for (size_t i = 0; i < size; ++i) { bool negative_lhs = std::is_signed::value && (i % 2 == 1); lhs[i] = value_type(i) * (negative_lhs ? -3 : 3); if (lhs[i] == value_type(0)) { lhs[i] += value_type(1); } rhs[i] = value_type(i) + value_type(2); } scalar = value_type(3); } else { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); if (lhs[i] == value_type(0)) { lhs[i] += value_type(0.1); } rhs[i] = value_type(10.2) / (i + 2) + value_type(0.25); } scalar = value_type(1.2); } } }; TEST_CASE_TEMPLATE("[batch]", B, BATCH_TYPES) { batch_test Test; SUBCASE("stream_dump") { Test.test_stream_dump(); } SUBCASE("load_store") { Test.test_load_store(); } SUBCASE("constructors") { Test.test_constructors(); } SUBCASE("static_builders") { Test.test_static_builders(); } SUBCASE("access_operator") { Test.test_access_operator(); } SUBCASE("first element") { Test.test_first_element(); } SUBCASE("get") { Test.test_get(); } SUBCASE("arithmetic") { Test.test_arithmetic(); } SUBCASE("incr decr") { Test.test_incr_decr(); } SUBCASE("saturated_arithmetic") { Test.test_saturated_arithmetic(); } SUBCASE("computed_assignment") { Test.test_computed_assignment(); } SUBCASE("comparison") { Test.test_comparison(); } SUBCASE("logical") { Test.test_logical(); } SUBCASE("min_max") { Test.test_min_max(); } SUBCASE("fused_operations") { Test.test_fused_operations(); } SUBCASE("abs") { Test.test_abs(); } SUBCASE("avg") { Test.test_avg(); } SUBCASE("horizontal_operations") { Test.test_horizontal_operations(); Test.test_common_horizontal_operations(std::integral_constant()); } SUBCASE("boolean_conversions") { Test.test_boolean_conversions(); } } #endif 
xtensor-stack-xsimd-541558d/test/test_batch_bool.cpp000066400000000000000000001040061517435117100225460ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include #include #include #include #include "test_utils.hpp" namespace xsimd { namespace test_detail { template struct ct_mask_arch { static constexpr bool supported() noexcept { return true; } static constexpr bool available() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 0; } static constexpr bool requires_alignment() noexcept { return false; } static constexpr char const* name() noexcept { return "ct_mask_arch"; } }; template struct ct_mask_register { std::array data {}; }; struct mask_all_false { static constexpr bool get(std::size_t, std::size_t) { return false; } }; struct mask_all_true { static constexpr bool get(std::size_t, std::size_t) { return true; } }; struct mask_prefix1 { static constexpr bool get(std::size_t i, std::size_t) { return i < 1; } }; struct mask_suffix1 { static constexpr bool get(std::size_t i, std::size_t n) { return i >= (n - 1); } }; struct mask_ends { static constexpr bool get(std::size_t i, std::size_t n) { return (i < 1) || (i >= (n - 1)); } }; struct mask_interleaved { static constexpr bool get(std::size_t i, std::size_t) { return (i % 2) == 0; } }; template struct alternating_numeric { static constexpr T get(std::size_t i, std::size_t) { return (i % 2) ? 
T(2) : T(1); } }; } namespace types { template struct simd_register> { using register_type = test_detail::ct_mask_register; register_type data; constexpr operator register_type() const noexcept { return data; } }; template struct has_simd_register> : std::true_type { }; } template struct get_bool_base { using vector_type = std::array; std::vector almost_all_false() { std::vector vectors; vectors.reserve(N); for (size_t i = 0; i < N; ++i) { vector_type v; v.fill(false); v[i] = true; vectors.push_back(std::move(v)); } return vectors; } std::vector almost_all_true() { auto vectors = almost_all_false(); flip(vectors); return vectors; } void flip(vector_type& vec) { std::transform(vec.begin(), vec.end(), vec.begin(), std::logical_not {}); } void flip(std::vector& vectors) { for (auto& vec : vectors) { flip(vec); } } }; template struct get_bool; template struct get_bool, 1> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = { 0 }; type ihalf = { 1 }; type interspersed = { 0 }; }; template struct get_bool, 2> : public get_bool_base { using type = batch_bool; type all_true = type(true); type all_false = type(false); type half = { 0, 1 }; type ihalf = { 1, 0 }; type interspersed = { 0, 1 }; }; template struct get_bool, 4> : public get_bool_base { using type = batch_bool; type all_true = true; type all_false = false; type half = { 0, 0, 1, 1 }; type ihalf = { 1, 1, 0, 0 }; type interspersed = { 0, 1, 0, 1 }; }; template struct get_bool, 8> : public get_bool_base { using type = batch_bool; type all_true = true; type all_false = false; type half = { 0, 0, 0, 0, 1, 1, 1, 1 }; type ihalf = { 1, 1, 1, 1, 0, 0, 0, 0 }; type interspersed = { 0, 1, 0, 1, 0, 1, 0, 1 }; }; template struct get_bool, 16> : public get_bool_base { using type = batch_bool; type all_true = true; type all_false = false; type half = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }; type ihalf = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 
0, 0 }; type interspersed = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }; }; template struct get_bool, 32> : public get_bool_base { using type = batch_bool; type all_true = true; type all_false = false; type half = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; type ihalf = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; type interspersed = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }; }; template struct get_bool, 64> : public get_bool_base { using type = batch_bool; type all_true = true; type all_false = false; type half = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; type ihalf = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; type interspersed = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }; }; } template struct batch_bool_test { using batch_type = T; using value_type = typename T::value_type; static constexpr size_t size = T::size; using batch_bool_type = typename T::batch_bool_type; using array_type = std::array; using bool_array_type = std::array; // Compile-time check helpers for batch_bool_constant masks template struct xsimd_ct_mask_checker; // Small masks: safe to compare numeric masks at compile time template struct xsimd_ct_mask_checker> { static constexpr std::size_t sum_indices(uint64_t bits, std::size_t index, std::size_t remaining) { return remaining == 0 ? 0u : ((bits & 1u ? 
index : 0u) + sum_indices(bits >> 1, index + 1, remaining - 1)); } static constexpr uint32_t low_mask_bits(std::size_t width) { return width == 0 ? 0u : (static_cast(1u << width) - 1u); } template struct splice_checker { static void run() { } }; template struct splice_checker { static void run() { constexpr std::size_t begin = 1; constexpr std::size_t end = (Mask::size > 3 ? 3 : Mask::size); constexpr std::size_t length = (end > begin) ? (end - begin) : 0; using slice_arch = xsimd::test_detail::ct_mask_arch; constexpr auto slice = xsimd::detail::splice(Mask {}); constexpr uint32_t src_mask = static_cast(Mask::mask()); constexpr uint32_t expected = (src_mask >> begin) & low_mask_bits(length); static_assert(static_cast(slice.mask()) == expected, "splice mask expected"); constexpr uint32_t slice_bits = static_cast(slice.mask()); constexpr uint32_t shifted_source = src_mask >> begin; static_assert((length == 0) || ((slice_bits & 1u) == (shifted_source & 1u)), "slice first bit matches"); static_assert((length <= 1) || (((slice_bits >> (length - 1)) & 1u) == ((shifted_source >> (length - 1)) & 1u)), "slice last bit matches"); } }; template struct half_checker { static void run() { } }; template struct half_checker { static void run() { constexpr std::size_t total = Mask::size; constexpr std::size_t mid = total / 2; using lower_arch = xsimd::test_detail::ct_mask_arch; using upper_arch = xsimd::test_detail::ct_mask_arch; constexpr auto lower = xsimd::detail::lower_half(Mask {}); constexpr auto upper = xsimd::detail::upper_half(Mask {}); constexpr uint32_t source_mask = static_cast(Mask::mask()); static_assert(static_cast(lower.mask()) == (source_mask & low_mask_bits(mid)), "lower_half mask matches"); static_assert(static_cast(upper.mask()) == ((source_mask >> mid) & low_mask_bits(total - mid)), "upper_half mask matches"); constexpr auto lower_splice = xsimd::detail::splice(Mask {}); constexpr auto upper_splice = xsimd::detail::splice(Mask {}); static_assert(lower.mask() == 
lower_splice.mask(), "lower_half equals splice"); static_assert(upper.mask() == upper_splice.mask(), "upper_half equals splice"); constexpr uint32_t lower_bits = static_cast(lower.mask()); constexpr uint32_t upper_bits = static_cast(upper.mask()); constexpr std::size_t upper_size = decltype(upper)::size; static_assert((mid == 0) || ((lower_bits & 1u) == (source_mask & 1u)), "lower first element"); static_assert((mid <= 1) || (((lower_bits >> (mid - 1)) & 1u) == ((source_mask >> (mid - 1)) & 1u)), "lower last element"); static_assert((upper_size == 0) || ((upper_bits & 1u) == ((source_mask >> mid) & 1u)), "upper first element"); static_assert((upper_size <= 1) || (((upper_bits >> (upper_size - 1)) & 1u) == ((source_mask >> (total - 1)) & 1u)), "upper last element"); } }; static void run() { using value_type = typename B::value_type; using arch_type = typename B::arch_type; constexpr auto m_zero = xsimd::make_batch_bool_constant(); constexpr auto m_one = xsimd::make_batch_bool_constant(); constexpr auto m_prefix = xsimd::make_batch_bool_constant(); constexpr auto m_suffix = xsimd::make_batch_bool_constant(); constexpr auto m_ends = xsimd::make_batch_bool_constant(); constexpr auto m_interleaved = xsimd::make_batch_bool_constant(); static_assert((m_zero | m_one).mask() == m_one.mask(), "0|1 == 1"); static_assert((m_zero & m_one).mask() == m_zero.mask(), "0&1 == 0"); static_assert((m_zero ^ m_zero).mask() == m_zero.mask(), "0^0 == 0"); static_assert((m_one ^ m_one).mask() == m_zero.mask(), "1^1 == 0"); static_assert((!m_zero).mask() == m_one.mask(), "!0 == 1"); static_assert((~m_zero).mask() == m_one.mask(), "~0 == 1"); static_assert((!m_one).mask() == m_zero.mask(), "!1 == 0"); static_assert((~m_one).mask() == m_zero.mask(), "~1 == 0"); static_assert(((m_prefix && m_suffix).mask()) == (m_prefix & m_suffix).mask(), "&& consistent"); static_assert(((m_prefix || m_suffix).mask()) == (m_prefix | m_suffix).mask(), "|| consistent"); static_assert((m_prefix | 
m_suffix).mask() == m_ends.mask(), "prefix|suffix == ends"); static_assert(B::size == 1 || (m_prefix & m_suffix).mask() == m_zero.mask(), "prefix&suffix == 0 when size>1"); static_assert(m_zero.none(), "zero mask none"); static_assert(!m_zero.any(), "zero mask any"); static_assert(!m_zero.all(), "zero mask all"); static_assert(m_zero.countr_zero() == B::size, "zero mask trailing zeros"); static_assert(m_zero.countl_zero() == B::size, "zero mask leading zeros"); static_assert(m_one.all(), "all mask all"); static_assert(m_one.any(), "all mask any"); static_assert(!m_one.none(), "all mask none"); static_assert(m_one.countr_zero() == 0, "all mask trailing zeros"); static_assert(m_one.countl_zero() == 0, "all mask leading zeros"); constexpr auto prefix_bits = static_cast(m_prefix.mask()); constexpr auto suffix_bits = static_cast(m_suffix.mask()); constexpr auto ends_bits_mask = static_cast(m_ends.mask()); static_assert((B::size == 0) || ((prefix_bits & 1u) != 0u), "prefix first element set"); static_assert((B::size <= 1) || ((prefix_bits & (1u << 1)) == 0u), "prefix second element cleared"); static_assert((B::size == 0) || (((suffix_bits >> (B::size - 1)) & 1u) != 0u), "suffix last element set"); static_assert((B::size <= 1) || ((suffix_bits & 1u) == 0u), "suffix first element cleared"); static_assert((B::size == 0) || ((ends_bits_mask & 1u) != 0u), "ends first element set"); static_assert((B::size == 0) || (((ends_bits_mask >> (B::size - 1)) & 1u) != 0u), "ends last element set"); static_assert((B::size <= 2) || (((ends_bits_mask >> 1) & 1u) == 0u), "ends interior element cleared"); static_assert(std::is_same::value, "as_batch_bool type"); static_assert(std::is_same(m_prefix)), typename B::batch_bool_type>::value, "conversion operator type"); // splice API is validated indirectly via arch-specific masked implementations. 
constexpr std::size_t prefix_zero = m_prefix.countr_zero(); constexpr std::size_t prefix_one = m_prefix.countr_one(); static_assert(prefix_zero == 0, "prefix mask zero leading zeros from LSB"); static_assert((B::size == 0 ? prefix_one == 0 : prefix_one == 1), "prefix mask trailing ones count"); constexpr std::size_t suffix_zero = m_suffix.countl_zero(); constexpr std::size_t suffix_one = m_suffix.countl_one(); static_assert(suffix_zero == 0, "suffix mask leading zeros count"); static_assert((B::size == 0 ? suffix_one == 0 : suffix_one == 1), "suffix mask trailing ones count"); splice_checker 1)>::run(); half_checker 0 && (B::size % 2 == 0))>::run(); } }; // Large masks: avoid calling mask() in constant expressions template struct xsimd_ct_mask_checker 31)>> { static void run() { } }; array_type lhs; array_type rhs; bool_array_type all_true; bool_array_type ba; batch_bool_test() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i); rhs[i] = i == 0 % 2 ? lhs[i] : lhs[i] * value_type(2); all_true[i] = true; ba[i] = i == 0 % 2 ? true : false; } } template struct pack { }; template static batch_bool_type make_batch_impl(F&& f, std::integral_constant, pack) { return batch_bool_type(bool(f(Values))...); } template static batch_bool_type make_batch_impl(F&& f, std::integral_constant, pack) { return make_batch_impl(std::forward(f), std::integral_constant(), pack()); } template static batch_bool_type make_batch(F&& f) { return make_batch_impl(std::forward(f), std::integral_constant(), pack<> {}); } void test_constructors() const { batch_bool_type a; // value uninitialized, cannot test it. 
(void)a; { bool_array_type res; batch_bool_type b(true); b.store_unaligned(res.data()); INFO("batch_bool{value}"); CHECK_EQ(res, all_true); batch_bool_type c { true }; c.store_unaligned(res.data()); INFO("batch_bool{value}"); CHECK_EQ(res, all_true); } { auto f_bool = [](size_t i) { return bool(i % 3); }; bool_array_type res; for (size_t i = 0; i < res.size(); i++) { res[i] = f_bool(i); } bool_array_type tmp; batch_bool_type b0 = make_batch(f_bool); b0.store_unaligned(tmp.data()); INFO("batch_bool(values...)"); CHECK_EQ(tmp, res); batch_bool_type b1 = make_batch(f_bool); b1.store_unaligned(tmp.data()); INFO("batch_bool{values...}"); CHECK_EQ(tmp, res); } } void test_load_store() const { bool_array_type res; batch_bool_type b(batch_bool_type::load_unaligned(ba.data())); b.store_unaligned(res.data()); CHECK_EQ(res, ba); alignas(xsimd::default_arch::alignment()) bool_array_type arhs(this->ba); alignas(xsimd::default_arch::alignment()) bool_array_type ares; b = batch_bool_type::load_aligned(arhs.data()); b.store_aligned(ares.data()); CHECK_EQ(ares, arhs); auto bool_g = xsimd::get_bool {}; // load/store, almost all false { size_t i = 0; for (const auto& vec : bool_g.almost_all_false()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); batch_bool_type expected = make_batch([i](size_t x) { return x == i; }); i++; CHECK_UNARY(xsimd::all(b == expected)); b.store_unaligned(res.data()); // Check that the representation is bitwise exact. 
CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0); } } // load/store, almost all true { size_t i = 0; for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); batch_bool_type expected = make_batch([i](size_t x) { return x != i; }); i++; CHECK_UNARY(xsimd::all(b == expected)); b.store_unaligned(res.data()); CHECK_UNARY(memcmp(res.data(), vec.data(), sizeof(res)) == 0); } } } void test_any_all() const { auto bool_g = xsimd::get_bool {}; // any { auto any_check_false = (batch_lhs() != batch_lhs()); bool any_res_false = xsimd::any(any_check_false); CHECK_FALSE(any_res_false); auto any_check_true = (batch_lhs() == batch_rhs()); bool any_res_true = xsimd::any(any_check_true); CHECK_UNARY(any_res_true); for (const auto& vec : bool_g.almost_all_false()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); bool any_res = xsimd::any(b); CHECK_UNARY(any_res); } for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); bool any_res = xsimd::any(b); CHECK_UNARY(any_res); } } // all { auto all_check_false = (batch_lhs() == batch_rhs()); bool all_res_false = xsimd::all(all_check_false); CHECK_FALSE(all_res_false); auto all_check_true = (batch_lhs() == batch_lhs()); bool all_res_true = xsimd::all(all_check_true); CHECK_UNARY(all_res_true); for (const auto& vec : bool_g.almost_all_false()) { // TODO: implement batch_bool(bool*) // It currently compiles (need to understand why) but does not // give expected result batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); bool all_res = xsimd::all(b); CHECK_FALSE(all_res); } for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); bool all_res = xsimd::all(b); CHECK_FALSE(all_res); } } // none { auto none_check_false = (batch_lhs() == batch_rhs()); bool none_res_false = xsimd::none(none_check_false); CHECK_FALSE(none_res_false); 
auto none_check_true = (batch_lhs() != batch_lhs()); bool none_res_true = xsimd::none(none_check_true); CHECK_UNARY(none_res_true); for (const auto& vec : bool_g.almost_all_false()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); bool none_res = xsimd::none(b); CHECK_FALSE(none_res); } for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); bool none_res = xsimd::none(b); CHECK_FALSE(none_res); } } } void test_logical_operations() const { auto bool_g = xsimd::get_bool {}; size_t s = size; // operator!= { bool res = xsimd::all(bool_g.half != bool_g.ihalf); CHECK_UNARY(res); } // operator== { CHECK_BATCH_EQ(bool_g.half, !bool_g.ihalf); } // operator && { batch_bool_type res = bool_g.half && bool_g.ihalf; bool_array_type ares; res.store_unaligned(ares.data()); size_t nb_false = std::count(ares.cbegin(), ares.cend(), false); CHECK_EQ(nb_false, s); } // operator || { batch_bool_type res = bool_g.half || bool_g.ihalf; bool_array_type ares; res.store_unaligned(ares.data()); size_t nb_true = std::count(ares.cbegin(), ares.cend(), true); CHECK_EQ(nb_true, s); } // operator ^ { batch_bool_type res = bool_g.half ^ bool_g.ihalf; bool_array_type ares; res.store_unaligned(ares.data()); size_t nb_true = std::count(ares.cbegin(), ares.cend(), true); CHECK_EQ(nb_true, s); } // bitwise_andnot { batch_bool_type res = xsimd::bitwise_andnot(bool_g.half, bool_g.half); bool_array_type ares; res.store_unaligned(ares.data()); size_t nb_false = std::count(ares.cbegin(), ares.cend(), false); CHECK_EQ(nb_false, s); } } void test_bitwise_operations() const { auto bool_g = xsimd::get_bool {}; // operator version { INFO("operator~"); CHECK_BATCH_EQ(bool_g.half, ~bool_g.ihalf); } { INFO("operator|"); CHECK_BATCH_EQ(bool_g.half | bool_g.ihalf, bool_g.all_true); } { INFO("operator&"); CHECK_BATCH_EQ(bool_g.half & bool_g.ihalf, bool_g.all_false); } { INFO("operator^"); CHECK_BATCH_EQ(bool_g.half ^ bool_g.all_true, 
bool_g.ihalf); } // free function version { INFO("bitwise_not"); CHECK_BATCH_EQ(bool_g.half, xsimd::bitwise_not(bool_g.ihalf)); } { INFO("bitwise_or"); CHECK_BATCH_EQ(xsimd::bitwise_or(bool_g.half, bool_g.ihalf), bool_g.all_true); } { INFO("bitwise_and"); CHECK_BATCH_EQ(xsimd::bitwise_and(bool_g.half, bool_g.ihalf), bool_g.all_false); } { INFO("bitwise_xor"); CHECK_BATCH_EQ(xsimd::bitwise_xor(bool_g.half, bool_g.all_true), bool_g.ihalf); } } void test_update_operations() const { auto bool_g = xsimd::get_bool {}; { auto tmp = bool_g.half; tmp |= bool_g.ihalf; bool res = xsimd::all(tmp); INFO("operator|="); CHECK_UNARY(res); } { auto tmp = bool_g.half; tmp &= bool_g.half; INFO("operator&="); CHECK_BATCH_EQ(tmp, bool_g.half); } { auto tmp = bool_g.half; tmp ^= bool_g.ihalf; bool res = xsimd::all(tmp); INFO("operator^="); CHECK_UNARY(res); } } void test_mask() const { auto bool_g = xsimd::get_bool {}; const uint64_t full_mask = ((uint64_t)-1) >> (64 - batch_bool_type::size); CHECK_EQ(bool_g.all_false.mask(), 0); CHECK_EQ(batch_bool_type::from_mask(bool_g.all_false.mask()).mask(), bool_g.all_false.mask()); CHECK_EQ(bool_g.all_true.mask(), full_mask); CHECK_EQ(batch_bool_type::from_mask(bool_g.all_true.mask()).mask(), bool_g.all_true.mask()); CHECK_EQ(bool_g.half.mask(), full_mask & ((uint64_t)-1) << (batch_bool_type::size / 2)); CHECK_EQ(batch_bool_type::from_mask(bool_g.half.mask()).mask(), bool_g.half.mask()); CHECK_EQ(bool_g.ihalf.mask(), full_mask & ~(((uint64_t)-1) << (batch_bool_type::size / 2))); CHECK_EQ(batch_bool_type::from_mask(bool_g.ihalf.mask()).mask(), bool_g.ihalf.mask()); CHECK_EQ(bool_g.interspersed.mask(), full_mask & 0xAAAAAAAAAAAAAAAAul); CHECK_EQ(batch_bool_type::from_mask(bool_g.interspersed.mask()).mask(), bool_g.interspersed.mask()); } void test_count() const { auto bool_g = xsimd::get_bool {}; CHECK_EQ(count(bool_g.all_false), 0); CHECK_EQ(count(bool_g.all_true), batch_bool_type::size); CHECK_EQ(count(bool_g.half), batch_bool_type::size / 2); } 
void test_count_lr() const { auto bool_g = xsimd::get_bool {}; { INFO("countl_zero"); CHECK_EQ(countl_zero(bool_g.all_false), batch_bool_type::size); CHECK_EQ(countl_zero(bool_g.all_true), 0); CHECK_EQ(countl_zero(bool_g.half), 0); CHECK_EQ(countl_zero(bool_g.ihalf), batch_bool_type::size / 2); } { INFO("countl_one"); CHECK_EQ(countl_one(bool_g.all_false), 0); CHECK_EQ(countl_one(bool_g.all_true), batch_bool_type::size); CHECK_EQ(countl_one(bool_g.half), batch_bool_type::size / 2); CHECK_EQ(countl_one(bool_g.ihalf), 0); } { INFO("countr_zero"); CHECK_EQ(countr_zero(bool_g.all_false), batch_bool_type::size); CHECK_EQ(countr_zero(bool_g.all_true), 0); CHECK_EQ(countr_zero(bool_g.half), batch_bool_type::size / 2); CHECK_EQ(countr_zero(bool_g.ihalf), 0); } { INFO("countr_one"); CHECK_EQ(countr_one(bool_g.all_false), 0); CHECK_EQ(countr_one(bool_g.all_true), batch_bool_type::size); CHECK_EQ(countr_one(bool_g.half), 0); CHECK_EQ(countr_one(bool_g.ihalf), batch_bool_type::size / 2); } { size_t i = 0; for (const auto& vec : bool_g.almost_all_false()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); if (i == 0) { CHECK_EQ(countr_zero(b), 0); CHECK_EQ(countr_one(b), 1); } else { CHECK_EQ(countr_zero(b), i); CHECK_EQ(countr_one(b), 0); } if (i == batch_bool_type::size - 1) { CHECK_EQ(countl_zero(b), 0); CHECK_EQ(countl_one(b), 1); } else { CHECK_EQ(countl_zero(b), batch_bool_type::size - 1 - i); CHECK_EQ(countl_one(b), 0); } i++; } } { size_t i = 0; for (const auto& vec : bool_g.almost_all_true()) { batch_bool_type b = batch_bool_type::load_unaligned(vec.data()); if (i == 0) { CHECK_EQ(countr_zero(b), 1); CHECK_EQ(countr_one(b), 0); } else { CHECK_EQ(countr_zero(b), 0); CHECK_EQ(countr_one(b), i); } if (i == batch_bool_type::size - 1) { CHECK_EQ(countl_zero(b), 1); CHECK_EQ(countl_one(b), 0); } else { CHECK_EQ(countl_zero(b), 0); CHECK_EQ(countl_one(b), batch_bool_type::size - 1 - i); } i++; } } { INFO("interspersed pattern"); 
CHECK_EQ(countr_zero(bool_g.interspersed), 1); CHECK_EQ(countr_one(bool_g.interspersed), 0); if (batch_bool_type::size % 2 == 0) { CHECK_EQ(countl_zero(bool_g.interspersed), 0); CHECK_EQ(countl_one(bool_g.interspersed), 1); } else { CHECK_EQ(countl_zero(bool_g.interspersed), 1); CHECK_EQ(countl_one(bool_g.interspersed), 0); } } } void test_comparison() const { auto bool_g = xsimd::get_bool {}; // eq { CHECK_BATCH_EQ(bool_g.half, !bool_g.ihalf); CHECK_BATCH_EQ(xsimd::eq(bool_g.half, !bool_g.ihalf), bool_g.all_true); } // neq { CHECK_BATCH_EQ(xsimd::neq(bool_g.half, bool_g.ihalf), bool_g.all_true); CHECK_BATCH_EQ(xsimd::neq(bool_g.all_true, bool_g.all_true), bool_g.all_false); } } void test_mask_compile_time() const { xsimd_ct_mask_checker::run(); } private: batch_type batch_lhs() const { return batch_type::load_unaligned(lhs.data()); } batch_type batch_rhs() const { return batch_type::load_unaligned(rhs.data()); } }; TEST_CASE_TEMPLATE("[xsimd batch bool]", B, BATCH_TYPES) { batch_bool_test Test; SUBCASE("constructors") { Test.test_constructors(); } SUBCASE("load store") { Test.test_load_store(); } SUBCASE("any all") { Test.test_any_all(); } SUBCASE("logical operations") { Test.test_logical_operations(); } SUBCASE("bitwise operations") { Test.test_bitwise_operations(); } SUBCASE("update operations") { Test.test_update_operations(); } SUBCASE("mask") { Test.test_mask(); } SUBCASE("count") { Test.test_count(); } SUBCASE("count{l,r}_{zero,one}") { Test.test_count_lr(); } SUBCASE("eq neq") { Test.test_comparison(); } SUBCASE("mask utils (compile-time)") { Test.test_mask_compile_time(); } } #endif xtensor-stack-xsimd-541558d/test/test_batch_cast.cpp000066400000000000000000000433231517435117100225510ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under 
the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 namespace detail { template inline std::enable_if_t::value && std::is_integral::value, bool> is_convertible(T_in value) { return static_cast(value) <= static_cast(std::numeric_limits::max()); } template inline std::enable_if_t::value && std::is_signed::value && std::is_integral::value && std::is_signed::value, bool> is_convertible(T_in value) { int64_t signed_value = static_cast(value); return signed_value < static_cast(std::numeric_limits::max()) && signed_value >= static_cast(std::numeric_limits::lowest()); } template inline std::enable_if_t::value && std::is_signed::value && std::is_unsigned::value, bool> is_convertible(T_in value) { return value >= 0 && is_convertible(static_cast(value)); } template inline std::enable_if_t::value && std::is_integral::value, bool> is_convertible(T_in value) { return value < static_cast(std::numeric_limits::max()) && value >= static_cast(std::numeric_limits::lowest()); } template inline std::enable_if_t::value, bool> is_convertible(T_in) { return true; } template using uses_fast_cast = std::is_same, xsimd::kernel::detail::with_fast_conversion>; } template struct batch_cast_test { static constexpr size_t N = CP::size; static constexpr size_t A = CP::alignment; using int8_batch = xsimd::batch; using uint8_batch = xsimd::batch; using int16_batch = xsimd::batch; using uint16_batch = xsimd::batch; using int32_batch = xsimd::batch; using uint32_batch = xsimd::batch; using int64_batch = xsimd::batch; using uint64_batch = xsimd::batch; using float_batch = xsimd::batch; using double_batch = xsimd::batch; std::vector int_test_values; std::vector float_test_values; std::vector double_test_values; 
batch_cast_test() { int_test_values = { 0, 0x01, 0x7f, 0x80, 0xff, 0x0100, 0x7fff, 0x8000, 0xffff, 0x00010000, 0x7fffffff, 0x80000000, 0xffffffff, 0x0000000100000000, 0x7fffffffffffffff, 0x8000000000000000, 0xffffffffffffffff }; float_test_values = { 0.0f, 1.0f, -1.0f, 127.0f, 128.0f, -128.0f, 255.0f, 256.0f, -256.0f, 32767.0f, 32768.0f, -32768.0f, 65535.0f, 65536.0f, -65536.0f, 2147483647.0f, 2147483648.0f, -2147483648.0f, 4294967167.0f }; double_test_values = { 0.0, 1.0, -1.0, 127.0, 128.0, -128.0, 255.0, 256.0, -256.0, 32767.0, 32768.0, -32768.0, 65535.0, 65536.0, -65536.0, 2147483647.0, 2147483648.0, -2147483648.0, 4294967295.0, 4294967296.0, -4294967296.0, 9223372036854775807.0, 9223372036854775808.0, -9223372036854775808.0, 18446744073709550591.0 }; } void test_bool_cast() const { test_bool_cast_impl("batch bool cast float -> int32"); test_bool_cast_impl("batch bool cast float -> uint32"); test_bool_cast_impl("batch bool cast int32 -> float"); test_bool_cast_impl("batch bool cast uint32 -> float"); test_bool_cast_impl("batch bool cast float -> float"); } void test_cast() const { for (const auto& test_value : int_test_values) { test_cast_impl(test_value, "batch cast int8 -> int8"); test_cast_impl(test_value, "batch cast int8 -> uint8"); test_cast_impl(test_value, "batch cast uint8 -> int8"); test_cast_impl(test_value, "batch cast uint8 -> uint8"); test_cast_impl(test_value, "batch cast int16 -> int16"); test_cast_impl(test_value, "batch cast int16 -> uint16"); test_cast_impl(test_value, "batch cast uint16 -> int16"); test_cast_impl(test_value, "batch cast uint16 -> uint16"); test_cast_impl(test_value, "batch cast int32 -> int32"); test_cast_impl(test_value, "batch cast int32 -> uint32"); test_cast_impl(test_value, "batch cast int32 -> float"); test_cast_impl(test_value, "batch cast uint32 -> int32"); test_cast_impl(test_value, "batch cast uint32 -> uint32"); test_cast_impl(test_value, "batch cast uint32 -> float"); test_cast_impl(test_value, "batch cast int64 
-> int64"); test_cast_impl(test_value, "batch cast int64 -> uint64"); test_cast_impl(test_value, "batch cast int64 -> double"); test_cast_impl(test_value, "batch cast uint64 -> int64"); test_cast_impl(test_value, "batch cast uint64 -> uint64"); test_cast_impl(test_value, "batch cast uint64 -> double"); } for (const auto& test_value : float_test_values) { test_cast_impl(test_value, "batch cast float -> int32"); test_cast_impl(test_value, "batch cast float -> uint32"); test_cast_impl(test_value, "batch cast float -> float"); } for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int64"); test_cast_impl(test_value, "batch cast double -> uint64"); test_cast_impl(test_value, "batch cast double -> double"); } } #if 0 && XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION template std::enable_if_t:type test_cast_sizeshift1() const { for (const auto& test_value : int_test_values) { test_cast_impl(test_value, "batch cast int8 -> int16"); test_cast_impl(test_value, "batch cast int8 -> uint16"); test_cast_impl(test_value, "batch cast uint8 -> int16"); test_cast_impl(test_value, "batch cast uint8 -> uint16"); test_cast_impl(test_value, "batch cast int16 -> int8"); test_cast_impl(test_value, "batch cast int16 -> uint8"); test_cast_impl(test_value, "batch cast int16 -> int32"); test_cast_impl(test_value, "batch cast int16 -> uint32"); test_cast_impl(test_value, "batch cast int16 -> float"); test_cast_impl(test_value, "batch cast uint16 -> int8"); test_cast_impl(test_value, "batch cast uint16 -> uint8"); test_cast_impl(test_value, "batch cast uint16 -> int32"); test_cast_impl(test_value, "batch cast uint16 -> uint32"); test_cast_impl(test_value, "batch cast uint16 -> float"); test_cast_impl(test_value, "batch cast int32 -> int16"); test_cast_impl(test_value, "batch cast int32 -> uint16"); test_cast_impl(test_value, "batch cast int32 -> int64"); test_cast_impl(test_value, "batch cast int32 -> uint64"); test_cast_impl(test_value, "batch cast 
int32 -> double"); test_cast_impl(test_value, "batch cast uint32 -> int16"); test_cast_impl(test_value, "batch cast uint32 -> uint16"); test_cast_impl(test_value, "batch cast uint32 -> int64"); test_cast_impl(test_value, "batch cast uint32 -> uint64"); test_cast_impl(test_value, "batch cast uint32 -> double"); test_cast_impl(test_value, "batch cast int64 -> int32"); test_cast_impl(test_value, "batch cast int64 -> uint32"); test_cast_impl(test_value, "batch cast int64 -> float"); test_cast_impl(test_value, "batch cast uint64 -> int32"); test_cast_impl(test_value, "batch cast uint64 -> uint32"); test_cast_impl(test_value, "batch cast uint64 -> float"); } for (const auto& test_value : float_test_values) { test_cast_impl(test_value, "batch cast float -> int16"); test_cast_impl(test_value, "batch cast float -> uint16"); test_cast_impl(test_value, "batch cast float -> int64"); test_cast_impl(test_value, "batch cast float -> uint64"); test_cast_impl(test_value, "batch cast float -> double"); } for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int32"); test_cast_impl(test_value, "batch cast double -> uint32"); test_cast_impl(test_value, "batch cast double -> float"); } } template std::enable_if_t::type test_cast_sizeshift1() const { } #endif #if 0 && XSIMD_X86_INSTR_SET > D_X86_AVX512_VERSION template std::enable_if_t:type test_cast_sizeshift2() const { for (const auto& test_value : int_test_values) { test_cast_impl(test_value, "batch cast int8 -> int32"); test_cast_impl(test_value, "batch cast int8 -> uint32"); test_cast_impl(test_value, "batch cast int8 -> float"); test_cast_impl(test_value, "batch cast uint8 -> int32"); test_cast_impl(test_value, "batch cast uint8 -> uint32"); test_cast_impl(test_value, "batch cast uint8 -> float"); test_cast_impl(test_value, "batch cast int16 -> int64"); test_cast_impl(test_value, "batch cast int16 -> uint64"); test_cast_impl(test_value, "batch cast int16 -> double"); 
test_cast_impl(test_value, "batch cast uint16 -> int64"); test_cast_impl(test_value, "batch cast uint16 -> uint64"); test_cast_impl(test_value, "batch cast uint16 -> double"); test_cast_impl(test_value, "batch cast int32 -> int8"); test_cast_impl(test_value, "batch cast int32 -> uint8"); test_cast_impl(test_value, "batch cast uint32 -> int8"); test_cast_impl(test_value, "batch cast uint32 -> uint8"); test_cast_impl(test_value, "batch cast int64 -> int16"); test_cast_impl(test_value, "batch cast int64 -> uint16"); test_cast_impl(test_value, "batch cast uint64 -> int16"); test_cast_impl(test_value, "batch cast uint64 -> uint16"); } for (const auto& test_value : float_test_values) { test_cast_impl(test_value, "batch cast float -> int8"); test_cast_impl(test_value, "batch cast float -> uint8"); } for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int16"); test_cast_impl(test_value, "batch cast double -> uint16"); } } template std::enable_if_t::type test_cast_sizeshift2() const { } #endif private: template void test_cast_impl(T test_value, const std::string& name) const { using T_in = typename B_in::value_type; using T_out = typename B_out::value_type; using B_common_in = xsimd::batch; using B_common_out = xsimd::batch; auto clamp = [](T v) { return static_cast(xsimd::min(v, static_cast(std::numeric_limits::max() - 1))); }; T_in in_test_value = clamp(test_value); if (detail::is_convertible(in_test_value)) { B_common_out res = xsimd::batch_cast(B_common_in(in_test_value)); INFO(name); T_out scalar_ref = static_cast(in_test_value); T_out scalar_res = res.get(0); CHECK_SCALAR_EQ(scalar_ref, scalar_res); CHECK_SCALAR_EQ(scalar_ref, xsimd::batch_cast(in_test_value)); } } template void test_bool_cast_impl(const std::string& name) const { using T_in = typename B_in::value_type; using T_out = typename B_out::value_type; using B_common_in = xsimd::batch_bool; using B_common_out = xsimd::batch_bool; B_common_in all_true_in(true); 
B_common_out all_true_res = xsimd::batch_bool_cast(all_true_in); INFO(name); CHECK_SCALAR_EQ(all_true_res.get(0), true); CHECK_SCALAR_EQ(xsimd::batch_bool_cast(true), true); B_common_in all_false_in(false); B_common_out all_false_res = xsimd::batch_bool_cast(all_false_in); INFO(name); CHECK_SCALAR_EQ(all_false_res.get(0), false); CHECK_SCALAR_EQ(xsimd::batch_bool_cast(false), false); } }; TEST_CASE_TEMPLATE("[xsimd cast tests]", B, CONVERSION_TYPES) { batch_cast_test Test; SUBCASE("bool cast") { Test.test_bool_cast(); } SUBCASE("cast") { Test.test_cast(); } } #endif #if 0 && XSIMD_X86_INSTR_SET > D_X86_AVX_VERSION TYPED_TEST(batch_cast_test, cast_sizeshift1) { this->test_cast_sizeshift1(); } #endif #if 0 && XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX512_VERSION TYPED_TEST(batch_cast_test, cast_sizeshift2) { this->test_cast_sizeshift2(); } #endif #if XSIMD_WITH_SSE2 TEST_CASE_TEMPLATE("[xsimd cast tests]", B, CONVERSION_TYPES) { SUBCASE("use fastcast") { using A = xsimd::default_arch; static_assert(detail::uses_fast_cast::value, "expected int32 to float conversion to use fast_cast"); static_assert(detail::uses_fast_cast::value, "expected float to int32 conversion to use fast_cast"); } } #endif #endif xtensor-stack-xsimd-541558d/test/test_batch_complex.cpp000066400000000000000000000656061517435117100232760ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include #include #include #include "test_utils.hpp" using namespace std::placeholders; template struct batch_complex_test { using batch_type = xsimd::simd_type; using arch_type = typename B::arch_type; using real_batch_type = typename B::real_batch; using value_type = typename B::value_type; using real_value_type = typename value_type::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; using real_array_type = std::array; array_type lhs; array_type rhs; value_type scalar; real_value_type real_scalar; #ifdef XSIMD_ENABLE_XTL_COMPLEX using xtl_value_type = xtl::xcomplex; using xtl_array_type = std::array; #endif batch_complex_test() { scalar = value_type(real_value_type(1.4), real_value_type(2.3)); real_scalar = scalar.real(); for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(real_value_type(i) / real_value_type(4) + real_value_type(1.2) * std::sqrt(real_value_type(i + 0.25)), real_value_type(i) / real_value_type(5)); rhs[i] = value_type(real_value_type(10.2) / real_value_type(i + 2) + real_value_type(0.25), real_value_type(i) / real_value_type(3.2)); } } void test_load_store() const { { array_type res; batch_type b = batch_type::load_unaligned(lhs.data()); b.store_unaligned(res.data()); CHECK_EQ(res, lhs); alignas(arch_type::alignment()) array_type arhs(this->rhs); alignas(arch_type::alignment()) array_type ares; b = batch_type::load_aligned(arhs.data()); b.store_aligned(ares.data()); CHECK_EQ(ares, rhs); } { real_array_type real, imag, res_real, res_imag; for (size_t i = 0; i < size; ++i) { real[i] = lhs[i].real(); imag[i] = lhs[i].imag(); } batch_type b = batch_type::load_unaligned(real.data(), imag.data()); b.store_unaligned(res_real.data(), res_imag.data()); CHECK_EQ(res_real, real); alignas(arch_type::alignment()) real_array_type areal, aimag, 
ares_real, ares_imag; for (size_t i = 0; i < size; ++i) { areal[i] = lhs[i].real(); aimag[i] = lhs[i].imag(); } b = batch_type::load_aligned(areal.data(), aimag.data()); b.store_aligned(ares_real.data(), ares_imag.data()); CHECK_EQ(ares_real, areal); } { real_array_type real, imag, res_real, res_imag; for (size_t i = 0; i < size; ++i) { real[i] = lhs[i].real(); imag[i] = 0; } batch_type b = batch_type::load_unaligned(real.data()); b.store_unaligned(res_real.data(), res_imag.data()); CHECK_EQ(res_real, real); CHECK_EQ(res_imag, imag); alignas(arch_type::alignment()) real_array_type areal, aimag, ares_real, ares_imag; for (size_t i = 0; i < size; ++i) { areal[i] = lhs[i].real(); aimag[i] = 0; } b = batch_type::load_aligned(areal.data()); b.store_aligned(ares_real.data(), ares_imag.data()); CHECK_EQ(ares_real, areal); CHECK_EQ(ares_imag, aimag); } } #ifdef XSIMD_ENABLE_XTL_COMPLEX void test_load_store_xtl() const { xtl_array_type tmp; std::fill(tmp.begin(), tmp.end(), xtl_value_type(2, 3)); alignas(arch_type::alignment()) xtl_array_type aligned_tmp; std::fill(aligned_tmp.begin(), aligned_tmp.end(), xtl_value_type(2, 3)); batch_type b0(xtl_value_type(2, 3)); CHECK_EQ(b0, tmp); batch_type b1 = xsimd::load_as(aligned_tmp.data(), xsimd::aligned_mode()); CHECK_EQ(b1, tmp); batch_type b2 = xsimd::load_as(tmp.data(), xsimd::unaligned_mode()); CHECK_EQ(b2, tmp); xsimd::store_as(aligned_tmp.data(), b1, xsimd::aligned_mode()); CHECK_EQ(b1, aligned_tmp); xsimd::store_as(tmp.data(), b2, xsimd::unaligned_mode()); CHECK_EQ(b2, tmp); } #endif void test_constructors() const { array_type tmp; std::fill(tmp.begin(), tmp.end(), value_type(2, 3)); batch_type b0a(value_type(2, 3)); CHECK_EQ(b0a, tmp); batch_type b0b = batch_type::broadcast(value_type(2, 3)); CHECK_EQ(b0b, tmp); batch_type b0c = xsimd::broadcast(value_type(2, 3)); CHECK_EQ(b0c, tmp); std::fill(tmp.begin(), tmp.end(), value_type(real_scalar)); batch_type b1(real_scalar); CHECK_EQ(b1, tmp); real_array_type real, imag; for 
(size_t i = 0; i < size; ++i) { real[i] = lhs[i].real(); imag[i] = lhs[i].imag(); tmp[i] = value_type(real[i]); } } void test_access_operator() const { batch_type res = batch_lhs(); for (size_t i = 0; i < size; ++i) { CHECK_EQ(res.get(i), lhs[i]); } } void test_first_element() const { batch_type res = batch_lhs(); CHECK_EQ(res.first(), lhs[0]); } template void test_get_impl(batch_type const& res, std::index_sequence) const { array_type extracted = { xsimd::get(res)... }; CHECK_EQ(extracted, lhs); CHECK_BATCH_EQ(batch_type::load_unaligned(extracted.data()), res); } void test_get() const { batch_type res = batch_lhs(); CHECK_EQ(xsimd::get<0>(res), res.first()); test_get_impl(res, std::make_index_sequence {}); } void test_arithmetic() const { // +batch { array_type expected = lhs; batch_type res = +batch_lhs(); CHECK_BATCH_EQ(res, expected); } // -batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::negate()); batch_type res = -batch_lhs(); CHECK_BATCH_EQ(res, expected); } // batch + batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs() + batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch + scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type lres = batch_lhs() + scalar; CHECK_BATCH_EQ(lres, expected); batch_type rres = scalar + batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch + real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l + r.real(); }); batch_type lres = batch_lhs() + batch_rhs().real(); CHECK_BATCH_EQ(lres, expected); batch_type rres = batch_rhs().real() + batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch + real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), 
_1, real_scalar)); batch_type lres = batch_lhs() + real_scalar; CHECK_BATCH_EQ(lres, expected); batch_type rres = real_scalar + batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch - batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs() - batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch - scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type lres = batch_lhs() - scalar; CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), scalar, _1)); batch_type rres = scalar - batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch - real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l - r.real(); }); batch_type lres = batch_lhs() - batch_rhs().real(); CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return r.real() - l; }); batch_type rres = batch_rhs().real() - batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch - real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, real_scalar)); batch_type lres = batch_lhs() - real_scalar; CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), real_scalar, _1)); batch_type rres = real_scalar - batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch * batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs() * batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch * scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), 
std::bind(std::multiplies(), _1, scalar)); batch_type lres = batch_lhs() * scalar; CHECK_BATCH_EQ(lres, expected); batch_type rres = scalar * batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch * real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r.real(); }); batch_type lres = batch_lhs() * batch_rhs().real(); CHECK_BATCH_EQ(lres, expected); batch_type rres = batch_rhs().real() * batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch * real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, real_scalar)); batch_type lres = batch_lhs() * real_scalar; CHECK_BATCH_EQ(lres, expected); batch_type rres = real_scalar * batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch / batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs() / batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch / scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type lres = batch_lhs() / scalar; CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), scalar, _1)); batch_type rres = scalar / batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch / real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l / r.real(); }); batch_type lres = batch_lhs() / batch_rhs().real(); CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return r.real() / l; }); batch_type rres = batch_rhs().real() / batch_lhs(); CHECK_BATCH_EQ(rres, expected); } // batch - real_scalar { array_type 
expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, real_scalar)); batch_type lres = batch_lhs() / real_scalar; CHECK_BATCH_EQ(lres, expected); std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), real_scalar, _1)); batch_type rres = real_scalar / batch_lhs(); CHECK_BATCH_EQ(rres, expected); } } void test_computed_assignment() const { // batch += batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::plus()); batch_type res = batch_lhs(); res += batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch += scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, scalar)); batch_type res = batch_lhs(); res += scalar; CHECK_BATCH_EQ(res, expected); } // batch += real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l + r.real(); }); batch_type res = batch_lhs(); res += batch_rhs().real(); CHECK_BATCH_EQ(res, expected); } // batch += real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::plus(), _1, real_scalar)); batch_type res = batch_lhs(); res += real_scalar; CHECK_BATCH_EQ(res, expected); } // batch -= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::minus()); batch_type res = batch_lhs(); res -= batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch -= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, scalar)); batch_type res = batch_lhs(); res -= scalar; CHECK_BATCH_EQ(res, expected); } // batch -= real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l - r.real(); }); batch_type res = 
batch_lhs(); res -= batch_rhs().real(); CHECK_BATCH_EQ(res, expected); } // batch -= real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::minus(), _1, real_scalar)); batch_type res = batch_lhs(); res -= real_scalar; CHECK_BATCH_EQ(res, expected); } // batch *= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::multiplies()); batch_type res = batch_lhs(); res *= batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch *= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, scalar)); batch_type res = batch_lhs(); res *= scalar; CHECK_BATCH_EQ(res, expected); } // batch *= real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r.real(); }); batch_type res = batch_lhs(); res *= batch_rhs().real(); CHECK_BATCH_EQ(res, expected); } // batch *= real_scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::multiplies(), _1, real_scalar)); batch_type res = batch_lhs(); res *= real_scalar; CHECK_BATCH_EQ(res, expected); } // batch /= batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), std::divides()); batch_type res = batch_lhs(); res /= batch_rhs(); CHECK_BATCH_EQ(res, expected); } // batch /= scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, scalar)); batch_type res = batch_lhs(); res /= scalar; CHECK_BATCH_EQ(res, expected); } // batch /= real_batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l / r.real(); }); batch_type res = batch_lhs(); res /= batch_rhs().real(); CHECK_BATCH_EQ(res, expected); } // batch /= real_scalar 
{ array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), std::bind(std::divides(), _1, real_scalar)); batch_type res = batch_lhs(); res /= real_scalar; CHECK_BATCH_EQ(res, expected); } } void test_conj_norm_proj() const { // conj { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { using std::conj; return conj(v); }); batch_type res = conj(batch_lhs()); CHECK_BATCH_EQ(res, expected); } // norm { real_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { using std::norm; return norm(v); }); real_batch_type res = norm(batch_lhs()); CHECK_BATCH_EQ(res, expected); } // proj { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { using std::proj; return proj(v); }); batch_type res = proj(batch_lhs()); CHECK_BATCH_EQ(res, expected); } } void test_conj_norm_proj_real() const { // conj real batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { return std::conj(std::real(v)); }); batch_type res = conj(real(batch_lhs())); CHECK_BATCH_EQ(res, expected); } // norm real batch { real_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { return std::norm(std::real(v)); }); real_batch_type res = norm(real(batch_lhs())); CHECK_BATCH_EQ(res, expected); } // proj real batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& v) { return std::proj(std::real(v)); }); batch_type res = proj(real(batch_lhs())); CHECK_BATCH_EQ(res, expected); } } void test_polar() const { // polar w/ magnitude/phase { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& v_lhs, const value_type& v_rhs) { return std::polar(std::real(v_lhs), std::real(v_rhs)); }); batch_type res = polar(real(batch_lhs()), 
real(batch_rhs())); CHECK_BATCH_EQ(res, expected); } } void test_horizontal_operations() const { // reduce_add { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(0)); value_type res = reduce_add(batch_lhs()); CHECK_SCALAR_EQ(res, expected); } // reduce_mul { value_type expected = std::accumulate(lhs.cbegin(), lhs.cend(), value_type(1), std::multiplies()); value_type res = reduce_mul(batch_lhs()); CHECK_SCALAR_EQ(res, expected); } } void test_fused_operations() const { // fma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r + r; }); batch_type res = xsimd::fma(batch_lhs(), batch_rhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } // fms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return l * r - r; }); batch_type res = fms(batch_lhs(), batch_rhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } // fnma { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r + r; }); batch_type res = fnma(batch_lhs(), batch_rhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } // fnms { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.begin(), expected.begin(), [](const value_type& l, const value_type& r) { return -l * r - r; }); batch_type res = fnms(batch_lhs(), batch_rhs(), batch_rhs()); CHECK_BATCH_EQ(res, expected); } } void test_boolean_conversion() const { // !batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return l == value_type(0); }); batch_type res = (batch_type)!batch_lhs(); CHECK_BATCH_EQ(res, expected); } } #ifndef __FAST_MATH__ void test_isnan() const { { bool_array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) 
{ return std::isnan(l.real()) || std::isnan(l.imag()); }); typename batch_type::batch_bool_type res = isnan(batch_lhs()); CHECK_BATCH_EQ(res, expected); } } #endif private: batch_type batch_lhs() const { batch_type res = batch_type::load_unaligned(lhs.data()); return res; } batch_type batch_rhs() const { batch_type res = batch_type::load_unaligned(rhs.data()); return res; } }; TEST_CASE_TEMPLATE("[xsimd complex batches]", B, BATCH_COMPLEX_TYPES) { batch_complex_test Test; SUBCASE("load_store") { Test.test_load_store(); } #ifdef XSIMD_ENABLE_XTL_COMPLEX SUBCASE("load_store_xtl") { Test.test_load_store_xtl(); } #endif SUBCASE("constructors") { Test.test_constructors(); } SUBCASE("access_operator") { Test.test_access_operator(); } SUBCASE("first element") { Test.test_first_element(); } SUBCASE("get") { Test.test_get(); } SUBCASE("arithmetic") { Test.test_arithmetic(); } SUBCASE("computed_assignment") { Test.test_computed_assignment(); } SUBCASE("conj_norm_proj") { Test.test_conj_norm_proj(); } SUBCASE("conj_norm_proj_real") { Test.test_conj_norm_proj_real(); } SUBCASE("polar") { Test.test_polar(); } SUBCASE("horizontal_operations") { Test.test_horizontal_operations(); } SUBCASE("fused_operations") { Test.test_fused_operations(); } SUBCASE("boolean_conversion") { Test.test_boolean_conversion(); } #ifndef __FAST_MATH__ SUBCASE("isnan") { Test.test_isnan(); } #endif } #endif xtensor-stack-xsimd-541558d/test/test_batch_constant.cpp000066400000000000000000000404511517435117100234470ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" template struct constant_batch_test { using batch_type = B; using value_type = typename B::value_type; using arch_type = typename B::arch_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; using batch_bool_type = typename batch_type::batch_bool_type; struct generator { static constexpr value_type get(size_t index, size_t /*size*/) { return index % 2 ? 0 : 1; } }; void test_init_from_constant() const { array_type expected; std::generate(expected.begin(), expected.end(), []() { return 1; }); constexpr auto b = xsimd::make_batch_constant(); INFO("batch(value_type)"); CHECK_BATCH_EQ((batch_type)b, expected); } void test_init_from_array() const { #if __cplusplus >= 202002L constexpr array_type expected = []() { array_type out = {}; std::iota(out.begin(), out.end(), 0); return out; }(); constexpr auto b = xsimd::make_batch_constant(); INFO("batch(value_type)"); CHECK_BATCH_EQ((batch_type)b, expected); #endif } void test_init_from_generator() const { array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return generator::get(i++, size); }); constexpr auto b = xsimd::make_batch_constant(); INFO("batch(value_type)"); CHECK_BATCH_EQ((batch_type)b, expected); } void test_cast() const { constexpr auto cst_b = xsimd::make_batch_constant(); auto b0 = cst_b.as_batch(); auto b1 = (batch_type)cst_b; CHECK_BATCH_EQ(b0, b1); // The actual values are already tested in test_init_from_generator } struct arange { static constexpr value_type get(size_t index, size_t /*size*/) { return static_cast(index); } }; void test_init_from_generator_arange() const { array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return arange::get(i++, size); }); constexpr auto b = 
xsimd::make_batch_constant(); INFO("batch(value_type)"); CHECK_BATCH_EQ((batch_type)b, expected); constexpr auto b_p = xsimd::make_iota_batch_constant(); INFO("batch(value_type)"); CHECK_BATCH_EQ((batch_type)b_p, expected); } template struct constant { static constexpr value_type get(size_t /*index*/, size_t /*size*/) { return V; } }; void test_init_from_constant_generator() const { array_type expected; std::fill(expected.begin(), expected.end(), constant<3>::get(0, 0)); constexpr auto b = xsimd::make_batch_constant, arch_type>(); INFO("batch(value_type)"); CHECK_BATCH_EQ((batch_type)b, expected); } void test_ops() const { constexpr auto n12 = xsimd::make_batch_constant, arch_type>(); constexpr auto n3 = xsimd::make_batch_constant, arch_type>(); constexpr std::integral_constant c3; constexpr auto n12_add_n3 = n12 + n3; constexpr auto n15 = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "n12 + n3 == n15"); constexpr auto n12_add_c3 = n12 + c3; static_assert(std::is_same::value, "n12 + c3 == n15"); constexpr auto n12_sub_n3 = n12 - n3; constexpr auto n9 = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "n12 - n3 == n9"); constexpr auto n12_sub_c3 = n12 - c3; static_assert(std::is_same::value, "n12 - c3 == n9"); constexpr auto n12_mul_n3 = n12 * n3; constexpr auto n36 = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "n12 * n3 == n36"); constexpr auto n12_mul_c3 = n12 * c3; static_assert(std::is_same::value, "n12 - c3 == n36"); constexpr auto n12_div_n3 = n12 / n3; constexpr auto n4 = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "n12 / n3 == n4"); constexpr auto n12_div_c3 = n12 / c3; static_assert(std::is_same::value, "n12 / c3 == n4"); constexpr auto n12_mod_n3 = n12 % n3; constexpr auto n0 = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "n12 % n3 == n0"); constexpr auto n12_mod_c3 = n12 % c3; 
static_assert(std::is_same::value, "n12 % c3 == n0"); constexpr auto n12_land_n3 = n12 & n3; static_assert(std::is_same::value, "n12 & n3 == n0"); constexpr auto n12_land_c3 = n12 & c3; static_assert(std::is_same::value, "n12 & c3 == n0"); constexpr auto n12_lor_n3 = n12 | n3; static_assert(std::is_same::value, "n12 | n3 == n15"); constexpr auto n12_lor_c3 = n12 | c3; static_assert(std::is_same::value, "n12 | c3 == n15"); constexpr auto n12_lxor_n3 = n12 ^ n3; static_assert(std::is_same::value, "n12 ^ n3 == n15"); constexpr auto n12_lxor_c3 = n12 ^ c3; static_assert(std::is_same::value, "n12 ^ c3 == n15"); constexpr auto n96 = xsimd::make_batch_constant, arch_type>(); constexpr auto n12_lshift_n3 = n12 << n3; static_assert(std::is_same::value, "n12 << n3 == n96"); constexpr auto n12_lshift_c3 = n12 << c3; static_assert(std::is_same::value, "n12 << c3 == n96"); constexpr auto n1 = xsimd::make_batch_constant, arch_type>(); constexpr auto n12_rshift_n3 = n12 >> n3; static_assert(std::is_same::value, "n12 >> n3 == n1"); constexpr auto n12_rshift_c3 = n12 >> c3; static_assert(std::is_same::value, "n12 >> c3 == n1"); constexpr auto n12_uadd = +n12; static_assert(std::is_same::value, "+n12 == n12"); constexpr auto n12_inv = ~n12; constexpr auto n12_inv_ = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "~n12 == n12_inv"); constexpr auto n12_usub = -n12; constexpr auto n12_usub_ = xsimd::make_batch_constant, arch_type>(); static_assert(std::is_same::value, "-n12 == n12_usub"); // comparison operators using true_batch_type = decltype(xsimd::make_batch_bool_constant()); using false_batch_type = decltype(xsimd::make_batch_bool_constant()); static_assert(std::is_same::value, "same type"); static_assert(std::is_same::value, "n12 == n12"); static_assert(std::is_same::value, "n12 == n3"); static_assert(std::is_same::value, "n12 == c3"); static_assert(std::is_same::value, "n12 != n12"); static_assert(std::is_same::value, "n12 != n3"); 
static_assert(std::is_same::value, "n12 != c3"); static_assert(std::is_same::value, "n12 < n12"); static_assert(std::is_same::value, "n12 < n3"); static_assert(std::is_same::value, "n12 < c3"); static_assert(std::is_same n12), false_batch_type>::value, "n12 > n12"); static_assert(std::is_same n3), true_batch_type>::value, "n12 > n3"); static_assert(std::is_same c3), true_batch_type>::value, "n12 > c3"); static_assert(std::is_same::value, "n12 <= n12"); static_assert(std::is_same::value, "n12 <= n3"); static_assert(std::is_same::value, "n12 <= c3"); static_assert(std::is_same= n12), true_batch_type>::value, "n12 >= n12"); static_assert(std::is_same= n3), true_batch_type>::value, "n12 >= n3"); static_assert(std::is_same= c3), true_batch_type>::value, "n12 >= c3"); } }; TEST_CASE_TEMPLATE("[constant batch]", B, BATCH_INT_TYPES) { constant_batch_test Test; SUBCASE("init_from_constant") { Test.test_init_from_constant(); } SUBCASE("test_init_from_array") { Test.test_init_from_array(); } SUBCASE("init_from_generator") { Test.test_init_from_generator(); } SUBCASE("as_batch") { Test.test_cast(); } SUBCASE("init_from_generator_arange") { Test.test_init_from_generator_arange(); } SUBCASE("init_from_constant_generator") { Test.test_init_from_constant_generator(); } SUBCASE("operators") { Test.test_ops(); } } template struct constant_bool_batch_test { using batch_type = B; using value_type = typename B::value_type; using arch_type = typename B::arch_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; using batch_bool_type = typename batch_type::batch_bool_type; struct generator { static constexpr bool get(size_t index, size_t /*size*/) { return index % 2; } }; void test_init_from_constant() const { bool_array_type expected; std::generate(expected.begin(), expected.end(), []() { return false; }); constexpr auto b = xsimd::make_batch_bool_constant(); INFO("batch_bool_constant(value_type)"); 
CHECK_BATCH_EQ((batch_bool_type)b, expected); } void test_init_from_array() const { #if __cplusplus >= 202002L constexpr bool_array_type expected = []() { bool_array_type out = {}; for (std::size_t k = 0; k < out.size(); ++k) { out[k] = k % 2 == 0; } return out; }(); constexpr auto b = xsimd::make_batch_bool_constant(); INFO("batch_bool_constant(value_type)"); CHECK_BATCH_EQ((batch_bool_type)b, expected); #endif } void test_init_from_generator() const { bool_array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return generator::get(i++, size); }); constexpr auto b = xsimd::make_batch_bool_constant(); INFO("batch_bool_constant(value_type)"); CHECK_BATCH_EQ((batch_bool_type)b, expected); } struct split { static constexpr bool get(size_t index, size_t size) { return index < size / 2; } }; void test_init_from_generator_split() const { bool_array_type expected; size_t i = 0; std::generate(expected.begin(), expected.end(), [&i]() { return split::get(i++, size); }); constexpr auto b = xsimd::make_batch_bool_constant(); INFO("batch_bool_constant(value_type)"); CHECK_BATCH_EQ((batch_bool_type)b, expected); } struct inv_split { static constexpr bool get(size_t index, size_t size) { return !split().get(index, size); } }; template struct constant { static constexpr bool get(size_t /*index*/, size_t /*size*/) { return Val; } }; void test_cast() const { constexpr auto all_true = xsimd::make_batch_bool_constant, arch_type>(); auto b0 = all_true.as_batch_bool(); auto b1 = (batch_bool_type)all_true; CHECK_BATCH_EQ(b0, batch_bool_type(true)); CHECK_BATCH_EQ(b1, batch_bool_type(true)); } void test_ops() const { constexpr auto all_true = xsimd::make_batch_bool_constant, arch_type>(); constexpr auto all_false = xsimd::make_batch_bool_constant, arch_type>(); constexpr auto x = xsimd::make_batch_bool_constant(); constexpr auto y = xsimd::make_batch_bool_constant(); constexpr auto x_or_y = x | y; static_assert(std::is_same::value, "x | y == true"); 
constexpr auto x_lor_y = x || y; static_assert(std::is_same::value, "x || y == true"); constexpr auto x_and_y = x & y; static_assert(std::is_same::value, "x & y == false"); constexpr auto x_land_y = x && y; static_assert(std::is_same::value, "x && y == false"); constexpr auto x_xor_y = x ^ y; static_assert(std::is_same::value, "x ^ y == true"); constexpr auto not_x = !x; static_assert(std::is_same::value, "!x == y"); constexpr auto inv_x = ~x; static_assert(std::is_same::value, "~x == y"); } }; TEST_CASE_TEMPLATE("[constant bool batch]", B, BATCH_INT_TYPES) { constant_bool_batch_test Test; SUBCASE("init_from_constant") { Test.test_init_from_constant(); } SUBCASE("test_init_from_array") { Test.test_init_from_array(); } SUBCASE("init_from_generator") { Test.test_init_from_generator(); } SUBCASE("as_batch") { Test.test_cast(); } SUBCASE("init_from_generator_split") { Test.test_init_from_generator_split(); } SUBCASE("operators") { Test.test_ops(); } } #endif xtensor-stack-xsimd-541558d/test/test_batch_float.cpp000066400000000000000000000104651517435117100227250ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" template struct batch_float_test { using batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; batch_float_test() { for (size_t i = 0; i < size; ++i) { lhs[i] = value_type(i) / 4 + value_type(1.2) * std::sqrt(value_type(i + 0.25)); if (lhs[i] == value_type(0)) { lhs[i] += value_type(0.1); } rhs[i] = value_type(10.2) / (i + 2) + value_type(0.25); } } void test_reciprocal() const { // reciprocal { array_type res, expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return value_type(1) / l; }); batch_type res1 = reciprocal(batch_lhs()); res1.store_unaligned(res.data()); size_t diff = detail::get_nb_diff_near(res, expected, 1e-12f); INFO("reciprocal"); CHECK_EQ(diff, 0); } } void test_rsqrt() const { // rsqrt { array_type res, expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return std::ceil((value_type(1) / std::sqrt(l)) * value_type(100)); }); batch_type res1 = ceil(rsqrt(batch_lhs()) * value_type(100)); res1.store_unaligned(res.data()); size_t diff = detail::get_nb_diff_near(res, expected, 1.5f * std::pow(2, 12)); INFO("rsqrt"); CHECK_EQ(diff, 0); } } void test_sqrt() const { // sqrt { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [](const value_type& l) { return std::sqrt(l); }); batch_type res = sqrt(batch_lhs()); INFO("sqrt"); CHECK_BATCH_EQ(res, expected); } } void test_haddp() const { batch_type haddp_input[size]; for (size_t i = 0; i < size; i += 2) { haddp_input[i] = batch_lhs(); if (i + 1 < size) { haddp_input[i + 1] = batch_rhs(); } } array_type expected; std::fill(expected.begin(), expected.end(), value_type(0)); for (size_t 
i = 0; i < size; ++i) { for (size_t j = 0; j < size; j += 2) { expected[j] += lhs[i]; if (j + 1 < size) { expected[j + 1] += rhs[i]; } } } auto res = haddp(haddp_input); INFO("haddp"); CHECK_BATCH_EQ(res, expected); } private: batch_type batch_lhs() const { return batch_type::load_unaligned(lhs.data()); } batch_type batch_rhs() const { return batch_type::load_unaligned(rhs.data()); } }; TEST_CASE_TEMPLATE("[xsimd batch float]", B, BATCH_FLOAT_TYPES) { batch_float_test Test; SUBCASE("reciprocal") { Test.test_reciprocal(); } SUBCASE("sqrt") { Test.test_sqrt(); } SUBCASE("rsqrt") { Test.test_rsqrt(); } SUBCASE("haddp") { Test.test_haddp(); } } #endif xtensor-stack-xsimd-541558d/test/test_batch_int.cpp000066400000000000000000000260331517435117100224100ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" #include namespace xsimd { template struct test_int_min_max { bool run() { return true; } }; template struct test_int_min_max, 2> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); std::array maxmin_cmp { { max, min } }; B maxmin = { max, min }; INFO("numeric max and min"); CHECK_BATCH_EQ(maxmin, maxmin_cmp); B a = { 1, 3 }; B b(2); B c = { 2, 3 }; auto r1 = xsimd::max(a, c); auto r3 = xsimd::min(a, c); INFO("max"); CHECK_BATCH_EQ(r1, (A { { 2, 3 } })); INFO("min"); CHECK_BATCH_EQ(r3, (A { { 1, 3 } })); auto r4 = a < b; // test lt BB e4 = { 1, 0 }; CHECK_UNARY(xsimd::all(r4 == e4)); } }; template struct test_int_min_max, 4> { void run() { using B = batch; using BB = batch_bool; using A = std::array; B a = { 1, 3, 1, 1 }; B b(2); B c = { 2, 3, 2, 3 }; auto r1 = xsimd::max(a, c); auto r3 = xsimd::min(a, c); INFO("max"); CHECK_BATCH_EQ(r1, (A { { 2, 3, 2, 3 } })); INFO("min"); CHECK_BATCH_EQ(r3, (A { { 1, 3, 1, 1 } })); auto r4 = a < b; // test lt BB e4 = { 1, 0, 1, 1 }; CHECK_UNARY(xsimd::all(r4 == e4)); } }; template struct test_int_min_max, 8> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); std::array maxmin_cmp { { 0, 0, max, 0, min, 0, 0, 0 } }; B maxmin = { 0, 0, max, 0, min, 0, 0, 0 }; INFO("numeric max and min"); CHECK_BATCH_EQ(maxmin, maxmin_cmp); B a { 1, 3, 1, 3, 1, 1, 3, 3 }; B b { 2 }; B c { 2, 3, 2, 3, 2, 3, 2, 3 }; auto r1 = xsimd::max(a, c); auto r3 = xsimd::min(a, c); auto r4 = a < b; // test lt INFO("max"); CHECK_BATCH_EQ(r1, (A { { 2, 3, 2, 3, 2, 3, 3, 3 } })); INFO("min"); CHECK_BATCH_EQ(r3, (A { { 1, 3, 1, 3, 1, 1, 2, 3 } })); BB e4 = { 1, 0, 1, 0, 1, 1, 0, 0 }; 
CHECK_UNARY(xsimd::all(r4 == e4)); } }; template struct test_int_min_max, 16> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); std::array maxmin_cmp { { 0, 0, max, 0, min, 0, 0, 0, 0, 0, max, 0, min, 0, 0, 0 } }; B maxmin = { 0, 0, max, 0, min, 0, 0, 0, 0, 0, max, 0, min, 0, 0, 0 }; INFO("numeric max and min"); CHECK_BATCH_EQ(maxmin, maxmin_cmp); B a = { 1, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, min, max, max, min }; B b(2); B c = { 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3 }; auto r1 = xsimd::max(a, b); auto r3 = xsimd::min(a, b); auto r4 = a < b; // test lt auto r5 = a == c; auto r6 = a != c; INFO("max"); CHECK_BATCH_EQ(r1, (A { { 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, max, max, 2 } })); INFO("min"); CHECK_BATCH_EQ(r3, (A { { 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, min, 2, 2, min } })); BB e4 = { 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1 }; CHECK_UNARY(xsimd::all(r4 == e4)); BB e5 = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0 }; CHECK_UNARY(xsimd::all(r5 == e5)); CHECK_UNARY(xsimd::all(r6 == !e5)); } }; template struct test_int_min_max, 32> { void run() { using B = batch; using BB = batch_bool; using A = std::array; T max = std::numeric_limits::max(); T min = std::numeric_limits::min(); B a = { 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, min, max, max, min }; B b = 2; auto r1 = xsimd::max(a, b); auto r3 = xsimd::min(a, b); auto r4 = a < b; // test lt INFO("max"); CHECK_BATCH_EQ(r1, (A { { 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 2, max, max, 2 } })); INFO("min"); CHECK_BATCH_EQ(r3, (A { { 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, min, 2, 2, min } })); BB e4 = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1 }; CHECK_UNARY(xsimd::all(r4 == e4)); } }; } template struct batch_int_test { using 
batch_type = B; using value_type = typename B::value_type; static constexpr size_t size = B::size; using array_type = std::array; using bool_array_type = std::array; array_type lhs; array_type rhs; array_type shift; batch_int_test() { using signed_value_type = std::make_signed_t; for (size_t i = 0; i < size; ++i) { bool negative_lhs = std::is_signed::value && (i % 2 == 1); lhs[i] = value_type(i) * (negative_lhs ? -10 : 10); if (lhs[i] == value_type(0)) { lhs[i] += value_type(1); } rhs[i] = value_type(i) + value_type(4); shift[i] = signed_value_type(i) % (CHAR_BIT * sizeof(value_type)); } } void test_modulo() const { // batch % batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), rhs.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l % r; }); batch_type res = batch_lhs() % batch_rhs(); INFO("batch % batch"); CHECK_BATCH_EQ(res, expected); } } void test_shift() const { int32_t nb_sh = 3; // batch << scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [nb_sh](const value_type& v) { return xsimd::abs(v) << nb_sh; }); batch_type res = abs(batch_lhs()) << nb_sh; INFO("batch << scalar"); CHECK_BATCH_EQ(res, expected); } // batch << batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), shift.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return xsimd::abs(l) << r; }); batch_type res = abs(batch_lhs()) << batch_shift(); INFO("batch << batch"); CHECK_BATCH_EQ(res, expected); } // batch >> scalar { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), expected.begin(), [nb_sh](const value_type& v) { return v >> nb_sh; }); batch_type res = batch_lhs() >> nb_sh; INFO("batch >> scalar"); CHECK_BATCH_EQ(res, expected); } // batch >> batch { array_type expected; std::transform(lhs.cbegin(), lhs.cend(), shift.cbegin(), expected.begin(), [](const value_type& l, const value_type& r) { return l >> r; }); batch_type res = batch_lhs() >> 
batch_shift(); INFO("batch >> batch"); CHECK_BATCH_EQ(res, expected); } } void test_more_shift() const { int32_t s = static_cast(sizeof(value_type) * 8); batch_type lhs = batch_type(value_type(1)); batch_type res; for (int32_t i = 0; i < s; ++i) { res = lhs << i; batch_type expected(value_type(1) << i); CHECK_BATCH_EQ(res, expected); } lhs = batch_type(std::numeric_limits::max()); for (int32_t i = 0; i < s; ++i) { res = lhs >> i; batch_type expected(std::numeric_limits::max() >> i); CHECK_BATCH_EQ(res, expected); } } void test_min_max() const { xsimd::test_int_min_max t; t.run(); } void test_less_than_underflow() const { batch_type test_negative_compare = batch_type(5) - 6; if (std::is_unsigned::value) { CHECK_FALSE(xsimd::any(test_negative_compare < 1)); } else { CHECK_UNARY(xsimd::all(test_negative_compare < 1)); } } private: batch_type batch_lhs() const { return batch_type::load_unaligned(lhs.data()); } batch_type batch_rhs() const { return batch_type::load_unaligned(rhs.data()); } batch_type batch_shift() const { return batch_type::load_unaligned(shift.data()); } }; TEST_CASE_TEMPLATE("[batch int tests]", B, BATCH_INT_TYPES) { batch_int_test Test; SUBCASE("modulo") { Test.test_modulo(); } SUBCASE("shift") { Test.test_shift(); } SUBCASE("more_shift") { Test.test_more_shift(); } SUBCASE("min_max") { Test.test_min_max(); } SUBCASE("less_than_underflow") { Test.test_less_than_underflow(); } } #endif xtensor-stack-xsimd-541558d/test/test_batch_manip.cpp000066400000000000000000000331701517435117100227220ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * Copyright (c) Marco Barbone * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #include "xsimd/xsimd.hpp" #ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE #include "test_utils.hpp" // Compile time tests for include/xsimd/arch/common/xsimd_common_swizzle.hpp namespace xsimd { namespace kernel { namespace detail { // ──────────────────────────────────────────────────────────────────────── // compile-time tests (identity, all-different, dup-lo, dup-hi) // 8-lane identity static_assert(is_identity(), "identity failed"); // 8-lane reverse is not identity static_assert(!is_identity(), "identity on reverse"); // 8-lane dup-lo (repeat 0..3 twice) static_assert(is_dup_lo(), "dup_lo failed"); static_assert(!is_dup_hi(), "dup_hi on dup_lo"); // 8-lane dup-hi (repeat 4..7 twice) static_assert(is_dup_hi(), "dup_hi failed"); static_assert(!is_dup_lo(), "dup_lo on dup_hi"); // 8-lane is-only-from-hi (repeat 4..7 twice) static_assert(is_only_from_hi(), "only_from_hi on hi"); static_assert(!is_only_from_hi(), "only_from_hi failed"); // 8-lane is-only-from-lo (repeat 4..7 twice) static_assert(is_only_from_lo(), "only_from_lo on lo"); static_assert(!is_only_from_lo(), "only_from_lo failed"); // ──────────────────────────────────────────────────────────────────────── // 4-lane identity static_assert(is_identity(), "4-lane identity failed"); // 4-lane reverse is not identity static_assert(!is_identity(), "4-lane identity on reverse"); // 4-lane dup-lo (repeat 0..1 twice) static_assert(is_dup_lo(), "4-lane dup_lo failed"); static_assert(!is_dup_hi(), "4-lane dup_hi on dup_lo"); // 4-lane dup-hi (repeat 2..3 twice) static_assert(is_dup_hi(), "4-lane dup_hi failed"); static_assert(!is_dup_lo(), "4-lane dup_lo on dup_hi"); static_assert(is_cross_lane(), "dup-lo only → crossing"); static_assert(is_cross_lane(), "dup-hi only → crossing"); static_assert(is_cross_lane(), "one low + rest high → crossing"); static_assert(!is_cross_lane(), "mixed low/high → no crossing"); static_assert(!is_cross_lane(), 
"mixed low/high → no crossing"); // 8-element 128-bit lane crossing checks // For 8 doubles (64 bytes): lanes are [0-1], [2-3], [4-5], [6-7] static_assert(!is_cross_lane(), "8-lane reverse within 128-bit lanes → no crossing"); static_assert(!is_cross_lane(), "identity 8-lane → no crossing"); static_assert(is_cross_lane(), "8-lane double swap first two 128-bit lanes → crossing"); // For 8 int32 (32 bytes): lanes are [0-3], [4-7] static_assert(is_cross_lane(), "8-lane int32_t swap 128-bit lanes → crossing"); // Additional compile-time checks for 16-element batches (e.g. float/int32) static_assert(is_cross_lane(), "16-lane 128-bit swap → crossing"); static_assert(!is_cross_lane(), "identity 16-lane → no crossing"); static_assert(is_cross_lane(), "16-lane uint32_t swap → crossing"); // Explicit 128-bit lane boundary checks (LaneSizeBytes = 16) // For float (4 bytes): 16 bytes = 4 elements per 128-bit lane static_assert(detail::is_cross_lane_with_lane_size<16, float, std::size_t, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15>(), "float: swap first two 128-bit lanes → crossing"); static_assert(!detail::is_cross_lane_with_lane_size<16, float, std::size_t, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(), "float: reverse within each 128-bit lane → no crossing"); // For double (8 bytes): 16 bytes = 2 elements per 128-bit lane static_assert(detail::is_cross_lane_with_lane_size<16, double, std::size_t, 2, 3, 0, 1, 4, 5, 6, 7>(), "double: swap first two 128-bit lanes → crossing"); static_assert(!detail::is_cross_lane_with_lane_size<16, double, std::size_t, 1, 0, 3, 2, 5, 4, 7, 6>(), "double: reverse within each 128-bit lane → no crossing"); } } } namespace xsimd { template