xsimd-13.2.0/.clang-format
---
BasedOnStyle: WebKit
AlignAfterOpenBracket: Align
AlignConsecutiveDeclarations: 'false'
BreakBeforeBraces: Allman
NamespaceIndentation: All
...

xsimd-13.2.0/.github/cmake-test/CMakeLists.txt
cmake_minimum_required(VERSION 3.19)
project(test VERSION 0.0.1)
find_package(xsimd REQUIRED)
add_executable(test main.cpp)
target_link_libraries(test PUBLIC xsimd)

xsimd-13.2.0/.github/cmake-test/main.cpp
#include "xsimd/xsimd.hpp"
int main() { return 0; }

xsimd-13.2.0/.github/toolchains/clang-aarch64-linux-gnu.cmake
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(triple aarch64-linux-gnu)
include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake)

xsimd-13.2.0/.github/toolchains/clang-arm-linux-gnueabihf.cmake
set(CMAKE_SYSTEM_PROCESSOR armv7-a)
set(triple arm-linux-gnueabihf)
include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake)

xsimd-13.2.0/.github/toolchains/clang-riscv64-linux-gnu.cmake
set(CMAKE_SYSTEM_PROCESSOR riscv64)
set(triple riscv64-linux-gnu)
include(${CMAKE_CURRENT_LIST_DIR}/clang.cmake)

xsimd-13.2.0/.github/toolchains/clang.cmake
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_COMPILER_TARGET ${triple})
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_COMPILER_TARGET ${triple})
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

xsimd-13.2.0/.github/toolchains/gcc-aarch64-linux-gnu.cmake
set(CMAKE_SYSTEM_PROCESSOR aarch64)
set(triple aarch64-linux-gnu)
include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake)

xsimd-13.2.0/.github/toolchains/gcc-arm-linux-gnueabihf.cmake
set(CMAKE_SYSTEM_PROCESSOR armv7-a)
set(triple arm-linux-gnueabihf)
include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake)

xsimd-13.2.0/.github/toolchains/gcc-riscv64-linux-gnu.cmake
set(CMAKE_SYSTEM_PROCESSOR riscv64)
set(triple riscv64-linux-gnu)
include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake)

xsimd-13.2.0/.github/toolchains/gcc.cmake
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_C_COMPILER ${triple}-gcc)
set(CMAKE_CXX_COMPILER ${triple}-g++)
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)

xsimd-13.2.0/.github/workflows/android.yml
name: Android build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        target:
          - armeabi-v7a
          - arm64-v8a
          - x86
          - x86_64
        api:
          - 16
          - 18
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Build script
        env:
          TARGET: ${{ matrix.target }}
          API: ${{ matrix.api }}
        run: |
          mkdir _build
          NDK="$($ANDROID_HOME/cmdline-tools/latest/bin/sdkmanager --list_installed | sed -E 's/( +[|] +)/|/g;s/ +$//' | grep '^ ndk' | cut -d '|' -f 4 | sort | head -n1)"
          cd _build && \
          cmake .. -DCMAKE_TOOLCHAIN_FILE=$ANDROID_HOME/$NDK/build/cmake/android.toolchain.cmake \
                   -DANDROID_ABI=$TARGET \
                   -DANDROID_PLATFORM=android-$API \
                   -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release
          cmake --build . --verbose

xsimd-13.2.0/.github/workflows/arch-consistency-check.yml
name: Arch consistency check
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Install dependencies
        run: sudo apt install g++
      - name: Check architecture consistency
        run: cd test && sh ./check_arch.sh

xsimd-13.2.0/.github/workflows/benchmark.yml
name: benchmark & examples
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          sudo apt install g++
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release
      - name: Build
        run: cmake --build _build
      - name: Testing sequential
        run: cmake --build _build --target xbenchmark

xsimd-13.2.0/.github/workflows/cmake.yml
name: CMake integration
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
defaults:
  run:
    shell: bash -l {0}
jobs:
  build:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Configure build
        run: |
          mkdir _build && cd _build
          cmake .. -DCMAKE_BUILD_TYPE=Release \
                   -DCMAKE_INSTALL_PREFIX=_install
      - name: Build
        run: cmake --build _build --target install
      - name: Check install
        run: |
          mkdir _install_build && cd _install_build
          cp ${{ github.workspace }}/.github/cmake-test/* .
          ls $PWD/../_build/_install/share/cmake/xsimd
          cmake . -DCMAKE_PREFIX_PATH=$PWD/../_build/_install/share/cmake/xsimd
          cmake --build .
xsimd-13.2.0/.github/workflows/cross-rvv.yml
name: RISC-V RVV cross-compilation build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
env:
  GCC_VERSION: "12"
jobs:
  build:
    runs-on: ubuntu-22.04
    name: 'RISC-V RVV${{ matrix.vector_bits }}'
    strategy:
      matrix:
        vector_bits:
          - 128
          - 256
          - 512
        LLVM_VERSION:
          - 17
          - 18
    steps:
      - name: Setup GCC
        run: |
          sudo apt-get -y -qq update
          sudo apt-get -y -qq --no-install-suggests --no-install-recommends install gcc-${GCC_VERSION}-riscv64-linux-gnu g++-${GCC_VERSION}-riscv64-linux-gnu
          sudo update-alternatives --install /usr/bin/riscv64-linux-gnu-gcc riscv64-linux-gnu-gcc /usr/bin/riscv64-linux-gnu-gcc-${GCC_VERSION} 20
          sudo update-alternatives --install /usr/bin/riscv64-linux-gnu-g++ riscv64-linux-gnu-g++ /usr/bin/riscv64-linux-gnu-g++-${GCC_VERSION} 20
      - name: Setup LLVM
        run: |
          # Install latest LLVM stable
          curl -o llvm.sh https://apt.llvm.org/llvm.sh
          chmod u+x llvm.sh
          sudo ./llvm.sh ${{ matrix.LLVM_VERSION }}
          sudo ln -srf $(which clang-${{ matrix.LLVM_VERSION }}) /usr/bin/clang
          sudo ln -srf $(which clang++-${{ matrix.LLVM_VERSION }}) /usr/bin/clang++
          rm llvm.sh
      - name: Setup QEMU
        uses: docker/setup-qemu-action@v3.0.0
        with:
          platforms: riscv64
      - name: Setup Ninja
        run: |
          sudo apt-get -y -qq install ninja-build
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Setup
        run: >
          cmake -S . -B _build -GNinja
          -DBUILD_TESTS=ON
          -DDOWNLOAD_DOCTEST=ON
          -DCMAKE_BUILD_TYPE=Release
          -DTARGET_ARCH=generic
          -DCMAKE_C_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
          -DCMAKE_CXX_FLAGS="-march=rv64gcv_zvl${{ matrix.vector_bits }}b_zba_zbb_zbs -mrvv-vector-bits=zvl"
          -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/clang-riscv64-linux-gnu.cmake
      - name: Build
        run: cmake --build _build
      - name: Testing xsimd
        run: >
          QEMU_CPU="rv64,zba=true,zbb=true,zbs=true,v=true,vlen=${{ matrix.vector_bits }},elen=64,vext_spec=v1.0"
          QEMU_LD_PREFIX="/usr/riscv64-linux-gnu"
          ./test/test_xsimd
        working-directory: ${{ github.workspace }}/_build

xsimd-13.2.0/.github/workflows/cross-sve.yml
name: Arm-SVE cross-compilation build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-20.04
    name: 'Arm SVE${{ matrix.vector_bits }}'
    strategy:
      matrix:
        vector_bits:
          - 128
          - 256
          - 512
    steps:
      - name: Setup compiler
        run: |
          sudo apt-get update || exit 1
          sudo apt-get --no-install-suggests --no-install-recommends install g++-10-aarch64-linux-gnu || exit 1
          sudo update-alternatives --install /usr/bin/aarch64-linux-gnu-gcc aarch64-linux-gnu-gcc /usr/bin/aarch64-linux-gnu-gcc-10 20
          sudo update-alternatives --install /usr/bin/aarch64-linux-gnu-g++ aarch64-linux-gnu-g++ /usr/bin/aarch64-linux-gnu-g++-10 20
      - name: Setup QEMU
        run: |
          sudo apt-get --no-install-suggests --no-install-recommends install qemu-user
      - name: Setup Ninja
        run: |
          sudo apt-get install ninja-build
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -GNinja -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_CXX_FLAGS="-march=armv8-a+sve -msve-vector-bits=${{ matrix.vector_bits }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/gcc-aarch64-linux-gnu.cmake
      - name: Build
        run: cmake --build _build
      - name: Testing xsimd
        run: |
          qemu-aarch64 --cpu max,sve${{ matrix.vector_bits }}=on -L /usr/aarch64-linux-gnu/ ./test/test_xsimd
        working-directory: ${{ github.workspace }}/_build

xsimd-13.2.0/.github/workflows/cross.yml
name: Arm cross-compilation build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
    strategy:
      matrix:
        target:
          - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'}
          - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon
          - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' }
        sys:
          - { compiler: 'gcc', version: '9' }
          # - { compiler: 'clang', version: '17' }
    steps:
      - name: Setup compiler
        if: ${{ matrix.sys.compiler == 'clang' }}
        run: |
          LLVM_VERSION=${{ matrix.sys.version }}
          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - || exit 1
          if [[ $LLVM_VERSION -eq 'latest' ]]; then
            sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" || exit 1
          else
            sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-$LLVM_VERSION main" || exit 1
          fi
          sudo apt-get update || exit 1
          if [[ $LLVM_VERSION -eq 'latest' ]]; then
            sudo apt-get --no-install-suggests --no-install-recommends install clang || exit 1
          else
            sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
          fi
          sudo apt-get --no-install-suggests --no-install-recommends install g++-9-${{ matrix.target.dir }} g++-9-multilib || exit 1
      - name: Setup compiler
        if: ${{ matrix.sys.compiler == 'gcc' }}
        run: |
          sudo apt-get update || exit 1
          sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1
          sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true
          sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true
          sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20
          sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20
      - name: Setup QEMU
        run: |
          sudo apt-get --no-install-suggests --no-install-recommends install qemu-user
      - name: Setup Ninja
        run: |
          sudo apt-get install ninja-build
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DTARGET_ARCH=generic -DCMAKE_C_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="-march=${{ matrix.target.arch }} ${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
      - name: Build
        run: cmake --build _build
      - name: Testing xsimd
        run: |
          qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd
        working-directory: ${{ github.workspace }}/_build

xsimd-13.2.0/.github/workflows/cxx-no-exceptions.yml
name: C++ -fno-except compatibility
on: [push, pull_request]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          sudo apt install g++
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-fno-exceptions
      - name: Build
        run: cmake --build _build

xsimd-13.2.0/.github/workflows/cxx-versions.yml
name: C++ compatibility build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        cxx-version: [11, 14, 17, 20]
    steps:
      - uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          sudo apt install g++
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=${{matrix.cxx-version}}
      - name: Build
        run: cmake --build _build

xsimd-13.2.0/.github/workflows/doxygen.yml
name: doc
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Install dependencies
        run: sudo apt install doxygen python3-breathe python3-sphinx-rtd-theme
      - name: Render
        run: make -C docs

xsimd-13.2.0/.github/workflows/emscripten.yml
name: Emscripten build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - uses: mamba-org/setup-micromamba@v1
        with:
          environment-name: xsimd
          create-args: >-
            microsoft::playwright
            python
          init-shell: bash
      - name: Build script
        shell: bash -el {0}
        run: |
          echo "Build script for wasm"
          playwright install
          ./test/test_wasm/test_wasm.sh

xsimd-13.2.0/.github/workflows/emulated.yml
name: Linux emulated build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
defaults:
  run:
    shell: bash -l {0}
jobs:
  build:
    runs-on: ubuntu-20.04
    name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - emulated'
    strategy:
      matrix:
        sys:
          - { compiler: 'gcc', version: '7'}
          - { compiler: 'clang', version: '8'}
    steps:
      - name: Setup compiler
        if: ${{ matrix.sys.compiler == 'gcc' }}
        run: |
          GCC_VERSION=${{ matrix.sys.version }}
          sudo apt-get update
          sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION
          CC=gcc-$GCC_VERSION
          echo "CC=$CC" >> $GITHUB_ENV
          CXX=g++-$GCC_VERSION
          echo "CXX=$CXX" >> $GITHUB_ENV
          CXXFLAGS="-Wno-noexcept-type -Wno-stringop-overflow -Wno-maybe-uninitialized"
          echo "CXXFLAGS=$CXXFLAGS" >> $GITHUB_ENV
      - name: Setup compiler
        if: ${{ matrix.sys.compiler == 'clang' }}
        run: |
          LLVM_VERSION=${{ matrix.sys.version }}
          #sudo add-apt-repository ppa:ubuntu-toolchain-r/test || exit 1
          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - || exit 1
          sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" || exit 1
          sudo apt-get update || exit 1
          sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
          sudo apt-get --no-install-suggests --no-install-recommends install g++-9 g++-9-multilib || exit 1
          sudo ln -s /usr/include/asm-generic /usr/include/asm
          CC=clang-$LLVM_VERSION
          echo "CC=$CC" >> $GITHUB_ENV
          CXX=clang++-$LLVM_VERSION
          echo "CXX=$CXX" >> $GITHUB_ENV
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Install mamba
        uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: environment.yml
      - name: Configure build
        env:
          CC: ${{ env.CC }}
          CXX: ${{ env.CXX }}
        run: |
          mkdir _build
          cd _build
          cmake .. -DBUILD_TESTS=ON \
                   -DBUILD_BENCHMARK=ON \
                   -DBUILD_EXAMPLES=ON \
                   -DCMAKE_BUILD_TYPE=Release \
                   -DCMAKE_C_COMPILER=$CC \
                   -DCMAKE_CXX_COMPILER=$CXX \
                   -DXSIMD_ENABLE_WERROR=ON \
                   -DCMAKE_CXX_FLAGS="-DXSIMD_DEFAULT_ARCH=emulated\<128\> -DXSIMD_WITH_EMULATED=1 ${CXXFLAGS}" \
                   -G Ninja
      - name: Build
        run: ninja -C _build
      - name: Test
        run: |
          cd _build/test
          ./test_xsimd

xsimd-13.2.0/.github/workflows/linux.yml
name: Linux build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
defaults:
  run:
    shell: bash -l {0}
jobs:
  build:
    runs-on: ubuntu-20.04
    name: '${{ matrix.sys.compiler }} ${{ matrix.sys.version }} - ${{ matrix.sys.flags }}'
    strategy:
      matrix:
        sys:
          - { compiler: 'gcc', version: '7', flags: 'force_no_instr_set' }
          - { compiler: 'gcc', version: '8', flags: 'enable_xtl_complex' }
          - { compiler: 'gcc', version: '9', flags: 'avx' }
          #- { compiler: 'gcc', version: '10', flags: 'avx512' } buggy
          - { compiler: 'gcc', version: '11', flags: 'avx512' }
          - { compiler: 'gcc', version: '11', flags: 'i386' }
          - { compiler: 'gcc', version: '11', flags: 'avx512pf' }
          - { compiler: 'gcc', version: '11', flags: 'avx512vbmi' }
          - { compiler: 'gcc', version: '11', flags: 'avx512vnni' }
          - { compiler: 'clang', version: '8', flags: 'force_no_instr_set' }
          - { compiler: 'clang', version: '10', flags: 'enable_xtl_complex' }
          - { compiler: 'clang', version: '12', flags: 'avx' }
          - { compiler: 'clang', version: '13', flags: 'sse3' }
          - { compiler: 'clang', version: '14', flags: 'avx512' }
    steps:
      - name: Setup compiler
        if: ${{ matrix.sys.compiler == 'gcc' }}
        run: |
          GCC_VERSION=${{ matrix.sys.version }}
          if [[ $GCC_VERSION == '6' || $GCC_VERSION == '7' || $GCC_VERSION == '8' ]]; then
            #sudo add-apt-repository ppa:ubuntu-toolchain-r/test
            sudo apt-get update
            sudo apt-get --no-install-suggests --no-install-recommends install g++-$GCC_VERSION
          fi
          if [[ '${{ matrix.sys.flags }}' -eq 'i386' ]]; then
            sudo dpkg --add-architecture i386
            sudo add-apt-repository ppa:ubuntu-toolchain-r/test
            sudo apt-get update
            sudo apt-get --no-install-suggests --no-install-recommends install gcc-$GCC_VERSION-multilib g++-$GCC_VERSION-multilib linux-libc-dev:i386
          fi
          CC=gcc-$GCC_VERSION
          echo "CC=$CC" >> $GITHUB_ENV
          CXX=g++-$GCC_VERSION
          echo "CXX=$CXX" >> $GITHUB_ENV
      - name: Setup compiler
        if: ${{ matrix.sys.compiler == 'clang' }}
        run: |
          LLVM_VERSION=${{ matrix.sys.version }}
          #sudo add-apt-repository ppa:ubuntu-toolchain-r/test || exit 1
          wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - || exit 1
          if [[ $LLVM_VERSION -ge 13 ]]; then
            sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal-$LLVM_VERSION main" || exit 1
          else
            sudo add-apt-repository "deb http://apt.llvm.org/focal/ llvm-toolchain-focal main" || exit 1
          fi || exit 1
          sudo apt-get update || exit 1
          sudo apt-get --no-install-suggests --no-install-recommends install clang-$LLVM_VERSION || exit 1
          sudo apt-get --no-install-suggests --no-install-recommends install g++-9 g++-9-multilib || exit 1
          sudo ln -s /usr/include/asm-generic /usr/include/asm
          CC=clang-$LLVM_VERSION
          echo "CC=$CC" >> $GITHUB_ENV
          CXX=clang++-$LLVM_VERSION
          echo "CXX=$CXX" >> $GITHUB_ENV
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Install mamba
        uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: environment.yml
      - name: Setup SDE
        if: startswith(matrix.sys.flags, 'avx512')
        run: sh install_sde.sh
      - name: Configure build
        env:
          CC: ${{ env.CC }}
          CXX: ${{ env.CXX }}
        run: |
          if [[ '${{ matrix.sys.flags }}' == 'enable_xtl_complex' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DENABLE_XTL_COMPLEX=ON"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'avx' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=sandybridge"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'sse3' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=nocona"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'avx512' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=skylake-avx512"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'avx512pf' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knl"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'avx512vbmi' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=cannonlake"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'avx512vnni' ]]; then
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DTARGET_ARCH=knm"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'i386' ]]; then
            CXX_FLAGS="$CXX_FLAGS -m32"
          fi
          if [[ '${{ matrix.sys.flags }}' == 'force_no_instr_set' ]]; then
            :
          else
            CMAKE_EXTRA_ARGS="$CMAKE_EXTRA_ARGS -DXSIMD_ENABLE_WERROR=ON"
          fi
          # Cheap way of spotting uninitialized read
          CXX_FLAGS="$CXX_FLAGS -ftrivial-auto-var-init=pattern"
          mkdir _build
          cd _build
          cmake .. -DBUILD_TESTS=ON \
                   -DBUILD_BENCHMARK=ON \
                   -DBUILD_EXAMPLES=ON \
                   -DCMAKE_BUILD_TYPE=Release \
                   -DCMAKE_C_COMPILER=$CC \
                   -DCMAKE_CXX_COMPILER=$CXX \
                   $CMAKE_EXTRA_ARGS \
                   -DCMAKE_CXX_FLAGS='$CXX_FLAGS' \
                   -G Ninja
      - name: Build
        run: ninja -C _build
      - name: Test
        run: |
          cd _build
          cd test
          if echo '${{ matrix.sys.flags }}' | grep -q 'avx512' ; then
            ../../sde-external-8.69.1-2021-07-18-lin/sde64 -skx -- ./test_xsimd
          else
            ./test_xsimd
          fi

xsimd-13.2.0/.github/workflows/macos.yml
name: macOS build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    strategy:
      matrix:
        os:
          - 13
          - 14
    runs-on: macos-${{ matrix.os }}
    name: 'macos-${{ matrix.os }}'
    steps:
      - uses: actions/checkout@v3
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
      - name: Build
        run: cmake --build _build --verbose
      - name: Testing sequential
        run: cmake --build _build --target xbenchmark --verbose
      - name: Testing xsimd
        run: ${{github.workspace}}/_build/test/test_xsimd

xsimd-13.2.0/.github/workflows/style-check.yml
name: style check
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  formatting-check:
    name: Format check
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Run clang-format style check for C/C++ programs.
        uses: jidicula/clang-format-action@v4.11.0
        with:
          clang-format-version: '17'
          exclude-regex: 'doctest.h'
  inlining-check:
    runs-on: ubuntu-latest
    name: Check inline keyword usage
    steps:
      - uses: actions/checkout@v2
      - run: sudo apt install clang-tools
      - run: sh ./test/check_inline_specifier.sh .

xsimd-13.2.0/.github/workflows/windows.yml
name: Windows build
on: [push, pull_request]
concurrency:
  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  build:
    name: 'MSVC ${{ matrix.os }}, ${{ matrix.target }} ${{ matrix.sys.set }}'
    defaults:
      run:
        shell: bash {0}
    strategy:
      matrix:
        os:
          - 2019
          - 2022
        target:
          - x86
          - x64
        sys:
          - { set: SSE, flags: "/arch:SSE2" }
          - { set: AVX, flags: "/arch:AVX" }
          - { set: AVX2, flags: "/arch:AVX2" }
          - { set: AVX512, flags: "/arch:AVX512" }
        exclude:
          # AVX on both platforms has a codegen error
          # On 2019 in _mm256_rsqrt_ps, on 2022 in _mm256_blend_p*
          - { sys: { set: AVX } }
          # On both platforms x86 + AVX512 triggers a compiler crash
          - { target: x86, sys: { set: AVX512 } }
          # /arch:SSE2 is not available on x64 platforms (SSE2 is enabled by default)
          - { target: x64, sys: { set: SSE} }
    runs-on: windows-${{ matrix.os }}
    steps:
      - name: Setup compiler
        uses: ilammy/msvc-dev-cmd@v1
        with:
          arch: ${{ matrix.target }}
      - name: Setup Ninja
        run: |
          python3 -m pip install --upgrade pip setuptools wheel
          python3 -m pip install ninja
      - name: Checkout xsimd
        uses: actions/checkout@v3
      - name: Setup
        run: |
          mkdir _build
          cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS="${{ matrix.sys.flags }}" -G Ninja
      - name: Build
        run: |
          cd _build && cmake --build .
      - name: Testing xsimd
        if: ${{ !startsWith(matrix.sys.set, 'AVX512') }}
        run: |
          cd _build && ./test/test_xsimd

  build-windows-mingw:
    name: 'MSYS2 ${{ matrix.msystem }}'
    runs-on: windows-2019
    defaults:
      run:
        shell: msys2 {0}
    strategy:
      matrix:
        # Temporarily remove MINGW64 and UCRT64 builds because
        # GCC 12 gives an unexpected overflow warning for __builtin_memmove
        # see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106199
        # Temporarily remove CLANG32 because doctest is not available anymore
        # for this platform
        msystem: [ MINGW32, CLANG64 ]
        #msystem: [ MINGW32, MINGW64, UCRT64, CLANG32, CLANG64 ]
      fail-fast: false
    steps:
      - name: Use MinGW from MSYS2
        uses: msys2/setup-msys2@v2
        with:
          msystem: ${{ matrix.msystem }}
          update: true
          path-type: minimal
          pacboy: >-
            cc:p
            cmake:p
            ninja:p
            doctest:p
      - name: Checkout xsimd
        uses: actions/checkout@v2
      - name: Configure
        run: |
          mkdir _build
          cd _build
          cmake .. -DBUILD_TESTS=ON -DBUILD_BENCHMARK=ON -DBUILD_EXAMPLES=ON -DCMAKE_BUILD_TYPE=Release -G Ninja
      - name: Build
        run: ninja -C _build
      - name: Test
        run: |
          cd _build && ./test/test_xsimd

xsimd-13.2.0/.gitignore
# Generated pkg-config files
*.pc

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

# Vim tmp files
*.swp

# Build folder
build/

# Documentation build artefacts
docs/CMakeCache.txt
docs/xml/
docs/build/

# VSCode / clangd IntelliSense
.vscode/
.cache/

# CLion / IDEA
.idea/

xsimd-13.2.0/CMakeLists.txt
############################################################################
# Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         #
# Martin Renou                                                             #
# Copyright (c) QuantStack                                                 #
# Copyright (c) Serge Guelton                                              #
#                                                                          #
# Distributed under the terms of the BSD 3-Clause License.                 #
#                                                                          #
# The full license is in the file LICENSE, distributed with this software. #
############################################################################

cmake_minimum_required(VERSION 3.8)
project(xsimd)

option(XSIMD_REFACTORING ON)

set(XSIMD_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)

# Versioning
# ==========

file(STRINGS "${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp" xsimd_version_defines
     REGEX "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH)")
foreach(ver ${xsimd_version_defines})
    if(ver MATCHES "#define XSIMD_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$")
        set(XSIMD_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "")
    endif()
endforeach()
set(${PROJECT_NAME}_VERSION
    ${XSIMD_VERSION_MAJOR}.${XSIMD_VERSION_MINOR}.${XSIMD_VERSION_PATCH})
message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}")

# Build
# =====

set(XSIMD_HEADERS
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_constants.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx2.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_sse.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma4.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_generic.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_isa.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_neon64.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_rvv.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_scalar.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse2.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_cpuid.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_aligned_allocator.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/memory/xsimd_alignment.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_all_registers.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_api.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_neon64_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx2_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx512f_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_avx_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_batch_constant.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_avx2_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_generic_arch.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_1_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse4_2_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_ssse3_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sve_register.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_traits.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_utils.hpp
    ${XSIMD_INCLUDE_DIR}/xsimd/xsimd.hpp
)

add_library(xsimd INTERFACE)

target_include_directories(xsimd INTERFACE
    $<BUILD_INTERFACE:${XSIMD_INCLUDE_DIR}>
    $<INSTALL_INTERFACE:include>)

OPTION(ENABLE_XTL_COMPLEX "enables support for xcomplex defined in xtl" OFF)
OPTION(BUILD_TESTS "xsimd test suite" OFF)

if(ENABLE_XTL_COMPLEX)
    find_package(xtl 0.7.0 REQUIRED)
    target_compile_features(xsimd INTERFACE cxx_std_14)
    target_compile_definitions(xsimd INTERFACE XSIMD_ENABLE_XTL_COMPLEX=1)
    target_link_libraries(xsimd INTERFACE xtl)
else()
    target_compile_features(xsimd INTERFACE cxx_std_11)
endif()

if(BUILD_TESTS)
    enable_testing()
    add_subdirectory(test)
endif()

OPTION(BUILD_BENCHMARK "xsimd benchmarks" OFF)
if(BUILD_BENCHMARK)
    add_subdirectory(benchmark)
endif()

OPTION(BUILD_EXAMPLES "xsimd examples" OFF)
if(BUILD_EXAMPLES)
    add_subdirectory(examples)
endif()

# Installation
# ============

OPTION(XSIMD_SKIP_INSTALL "Skip installation or not. By default it is OFF" OFF)
if(${XSIMD_SKIP_INSTALL})
    return() # skip installation
endif ()

set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(JoinPaths)
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

install(TARGETS xsimd EXPORT ${PROJECT_NAME}-targets)

# Makes the project importable from the build directory
export(EXPORT ${PROJECT_NAME}-targets
       FILE "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Targets.cmake")

install(DIRECTORY ${XSIMD_INCLUDE_DIR}/xsimd
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(XSIMD_CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/cmake/${PROJECT_NAME}"
    CACHE STRING "install path for xsimdConfig.cmake")

configure_package_config_file(${PROJECT_NAME}Config.cmake.in
                              "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
                              INSTALL_DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR})

# xsimd is header-only and does not depend on the architecture.
# Remove CMAKE_SIZEOF_VOID_P from xsimdConfigVersion.cmake so that an xsimdConfig.cmake
# generated for a 64 bit target can be used for 32 bit targets and vice versa.
set(_XTENSOR_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
unset(CMAKE_SIZEOF_VOID_P)
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
                                 VERSION ${${PROJECT_NAME}_VERSION}
                                 COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
        DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR})
install(EXPORT ${PROJECT_NAME}-targets
        FILE ${PROJECT_NAME}Targets.cmake
        DESTINATION ${XSIMD_CMAKECONFIG_INSTALL_DIR})

configure_file(${PROJECT_NAME}.pc.in
               "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
               @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc"
        DESTINATION "${CMAKE_INSTALL_DATADIR}/pkgconfig/")

xsimd-13.2.0/CONTRIBUTING.md
# Contributing to xsimd

First, thanks for being there! Welcome on board, we will try to make your contributing journey as good an experience as it can be.

# Submitting patches

Patches should be submitted through Github PR. We did put some effort into setting up decent Continuous Integration coverage, so please try to make it green ;-)

We use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) to keep the coding style consistent; a ``.clang-format`` file is shipped within the source, feel free to use it!

# Extending the API

We are open to extending the API, as long as it has been discussed either in an Issue or a PR. The only constraint is to add testing for new functions, and make sure they work on all supported architectures, not only your favorite one!

# Licensing

We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. Stated otherwise, there's no copyright assignment.

xsimd-13.2.0/Changelog.rst
.. Copyright (c) Serge Guelton and Johan Mabille
   Copyright (c) QuantStack

   Distributed under the terms of the BSD 3-Clause License.

   The full license is in the file LICENSE, distributed with this software.

Changelog
=========

13.2.0
------

* Added broadcast overload for bool
* Fixed kernel::store for booleans
* Explicitly verify dependency between architectures (like sse3 implies sse2)
* Use default arch alignment as default alignment for xsimd::aligned_allocator
* sse2 version of xsimd::swizzle on [u]int16_t
* avx implementation of transpose for [u]int[8|16]
* Implement [u]int8 and [u]int16 matrix transpose for 128 bit registers
* Fix minor warning
* Fix fma4 support

13.1.0
------

* Fix rotate_left and rotate_right behavior (it was swapped!)
* Fix compress implementation on RISC-V
* Improve RISC-V CI
* Fix clang-17 compilation on RISC-V
* Validate cmake integration
* Provide xsimd::transpose on 64 and 32 bits on most platforms
* Improve documentation
* Provide xsimd::batch_bool::count
* Fix interaction between xsimd::make_sized_batch_t and xsimd::batch
* Fix vbmi, sve and rvv detection through xsimd::available_architectures
* Fix compilation on MS targets where ``small`` can be defined.
* Change default install directory for installed headers.
* Support mixed-complex implementations of xsimd::pow()
* Improve xsimd::pow implementation for complex numbers
* Fix uninitialized read in lgamma implementation

13.0.0
------

* Most xsimd functions are flagged as always_inline
* Fix some xsimd scalar version (abs, bitofsign, signbit, bitwise_cast, exp10)
* Move from ``batch_constant<batch<T, A>, Csts...>`` to ``batch_constant<T, A, Csts...>``
* Move from ``batch_bool_constant<batch<T, A>, Csts...>`` to ``batch_bool_constant<T, A, Csts...>``
* Provide an as_batch() method (resp. as_batch_bool) method for batch_constant (resp. batch_bool_constant)
* New architecture ``emulated<N>`` for batches of N bits emulated using scalar operations.
* Remove the version method from all architectures
* Support xsimd::avg and xsimd::avgr vector operation
* Model i8mm arm extension
* Fix dispatching mechanism

12.1.1
------

* Update readme with a section on adoption, and a section on the history of the project
* Fix/avx512vnni implementation
* Fix regression on XSIMD_NO_SUPPORTED_ARCHITECTURE

12.1.0
------

* Fix various problems with architecture version handling
* Specialize xsimd::compress for riscv
* Provide stubs for various avx512xx architectures

12.0.0
------

* Fix sincos implementation to cope with Emscripten
* Upgraded minimal version of cmake to remove deprecation warning
* Fixed constants::signmask for GCC when using ffast-math
* Add RISC-V Vector support
* Generic, simple implementation for xsimd::compress
* Disable batch of bools, and suggest using batch_bool instead
* Add an option to skip installation

11.2.0
------

* Provide shuffle operations of floating point batches
* Provide a generic implementation of xsimd::swizzle with dynamic indices
* Implement rotl, rotr, rotate_left and rotate_right
* Let CMake figure out pkgconfig directories
* Add missing boolean operators in xsimd_api.hpp
* Initial Implementation for the new WASM based instruction set
* Provide a generic version for float to uint32_t conversion

11.1.0
------

* Introduce XSIMD_DEFAULT_ARCH to force default architecture (if any)
* Remove C++ requirement on xsimd::exp10 scalar implementation
* Improve and test documentation

11.0.0
------

* Provide a generic reducer
* Fix ``find_package(xsimd)`` for xtl enabled xsimd, reloaded
* Cleanup benchmark code
* Provide avx512f implementation of FMA and variant
* Hexadecimal floating points are not a C++11 feature
* back to slow implementation of exp10 on Windows
* Changed bitwise_cast API
* Provide generic signed /unsigned type conversion
* Fixed sde location
* Feature/incr decr
* Cleanup documentation

10.0.0
------

* Fix potential ABI issue in SVE support
* Disable fast exp10 on OSX
* Assert on unaligned memory when calling aligned load/store
* Fix warning about uninitialized storage
* Always forward arch parameter
* Do not specialize the behavior of ``simd_return_type`` for char
* Support broadcasting of complex batches
* Make xsimd compatible with -fno-exceptions
* Provide and test comparison operators overloads that accept scalars

9.0.1
-----

* Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to size-dependent type.
9.0.0
-----

* Support fixed size SVE
* Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16``
* Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max``
* Properly report unsupported double for neon on arm32
* Fill holes in xsimd scalar api
* Fix ``find_package(xsimd)`` for xtl enabled xsimd
* Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast``
* Native ``xsimd::hadd`` for float on arm64
* Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex
* Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)``
* Flag some function with ``[[nodiscard]]``
* Accept both relative and absolute libdir and include dir in xsimd.pc
* Implement ``xsimd::nearbyint_as_int`` for NEON
* Add ``xsimd::polar``
* Speedup double -> F32/I32 gathers
* Add ``xsimd::slide_left`` and ``xsimd::slide_right``
* Support integral ``xsimd::swizzles`` on AVX

8.1.0
-----

* Add ``xsimd::gather`` and ``xsimd::scatter``
* Add ``xsimd::nearbyint_as_int``
* Add ``xsimd::none``
* Add ``xsimd::reciprocal``
* Remove batch constructor from memory address, use ``xsimd::batch<...>::load_(un)aligned`` instead
* Leave to msvc users the opportunity to manually disable FMA3 on AVX
* Provide ``xsimd::insert`` to modify a single value from a vector
* Make ``xsimd::pow`` implementation resilient to ``FE_INVALID``
* Reciprocal square root support through ``xsimd::rsqrt``
* NEON: Improve ``xsimd::any`` and ``xsimd::all``
* Provide type utility to explicitly require a batch of given size and type
* Implement ``xsimd::swizzle`` on x86, neon and neon64
* Avx support for ``xsimd::zip_lo`` and ``xsimd::zip_hi``
* Only use ``_mm256_unpacklo_epi`` on AVX2
* Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)``
* Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float``
* Provide AVX2 conversion function from ``(u)int64_t`` to ``double``
* Provide better SSE conversion function from ``uint64_t`` to ``double``
* Provide better SSE conversion function to ``double``
* Support logical xor for ``xsimd::batch_bool``
* Clarify fma support:

  - FMA3 + SSE -> ``xsimd::fma3<sse4_2>``
  - FMA3 + AVX -> ``xsimd::fma3<avx>``
  - FMA3 + AVX2 -> ``xsimd::fma3<avx2>``
  - FMA4 -> ``xsimd::fma4``

* Allow ``xsimd::transform`` to work with complex types
* Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj``

8.0.5
-----

* Fix neon ``xsimd::hadd`` implementation
* Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE`` if needs be

8.0.4
-----

* Provide some conversion operators for ``float`` -> ``uint32``
* Improve code generated for AVX2 signed integer comparisons
* Enable detection of avx512cd and avx512dq, and fix avx512bw detection
* Enable detection of AVX2+FMA
* Pick the best compatible architecture in ``xsimd::dispatch``
* Enables support for FMA when AVX2 is detected on Windows
* Add missing includes / forward declaration
* Mark all functions inline and noexcept
* Assert when using incomplete ``std::initializer_list``

8.0.3
-----

* Improve CI & testing, no functional change

8.0.2
-----

* Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction

8.0.1
-----

* Fix invalid constexpr ``std::make_tuple`` usage in neon64

xsimd-13.2.0/LICENSE
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
Copyright (c) 2016, QuantStack
Copyright (c) 2018, Serge Guelton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

xsimd-13.2.0/README.md
# ![xsimd](docs/source/xsimd.svg)

[![GHA android](https://github.com/xtensor-stack/xsimd/actions/workflows/android.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/android.yml)
[![GHA cross-rvv](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-rvv.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-rvv.yml)
[![GHA cross-sve](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-sve.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cross-sve.yml)
[![GHA cross](https://github.com/xtensor-stack/xsimd/actions/workflows/cross.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cross.yml)
[![GHA cxx-no-exceptions](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-no-exceptions.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-no-exceptions.yml)
[![GHA cxx-versions](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-versions.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/cxx-versions.yml)
[![GHA emscripten](https://github.com/xtensor-stack/xsimd/actions/workflows/emscripten.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/emscripten.yml)
[![GHA linux](https://github.com/xtensor-stack/xsimd/actions/workflows/linux.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/linux.yml)
[![GHA macos](https://github.com/xtensor-stack/xsimd/actions/workflows/macos.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/macos.yml)
[![GHA windows](https://github.com/xtensor-stack/xsimd/actions/workflows/windows.yml/badge.svg)](https://github.com/xtensor-stack/xsimd/actions/workflows/windows.yml)
[![Documentation Status](http://readthedocs.org/projects/xsimd/badge/?version=latest)](https://xsimd.readthedocs.io/en/latest/?badge=latest)
[![Join the Gitter Chat](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/QuantStack/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

C++ wrappers for SIMD intrinsics

## Introduction

SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. However, these instructions differ between microprocessor vendors and compilers.

`xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of numbers with the same arithmetic operators as for single values. It also provides accelerated implementation of common mathematical functions operating on batches.

## Adoption

Beyond Xtensor, Xsimd has been adopted by major open-source projects, such as Mozilla Firefox, Apache Arrow, Pythran, and Krita.

## History

The XSimd project started with a series of blog articles by Johan Mabille on how to implement wrappers for SIMD intrinsics. The archives of the blog can be found here: [The C++ Scientist](http://johanmabille.github.io/blog/archives/). The design described in the articles remained close to the actual architecture of XSimd up until Version 8.0.

The mathematical functions are a lightweight implementation of the algorithms originally implemented in the now deprecated [boost.SIMD](https://github.com/NumScale/boost.simd) project.

## Requirements

`xsimd` requires a C++11 compliant compiler. The following C++ compilers are supported:

Compiler                | Version
------------------------|-------------------------------
Microsoft Visual Studio | MSVC 2015 update 2 and above
g++                     | 4.9 and above
clang                   | 4.0 and above

The following SIMD instruction set extensions are supported:

Architecture | Instruction set extensions
-------------|-----------------------------------------------------
x86          | SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA3+SSE, FMA3+AVX, FMA3+AVX2
x86          | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher)
x86 AMD      | FMA4
ARM          | NEON, NEON64, SVE128/256/512 (fixed vector size)
WebAssembly  | WASM
RISC-V       | RISC-V128/256/512 (fixed vector size)

## Installation

### Install from conda-forge

A package for xsimd is available on the mamba (or conda) package manager.

```bash
mamba install -c conda-forge xsimd
```

### Install with Spack

A package for xsimd is available on the Spack package manager.

```bash
spack install xsimd
spack load xsimd
```

### Install from sources

You can directly install it from the sources with cmake:

```bash
cmake -D CMAKE_INSTALL_PREFIX=your_install_prefix .
make install
```

## Documentation

To get started with using `xsimd`, check out the full documentation

http://xsimd.readthedocs.io/

## Dependencies

`xsimd` has an optional dependency on the [xtl](https://github.com/xtensor-stack/xtl) library:

| `xsimd` | `xtl` (optional) |
|---------|------------------|
| master  | ^0.7.0           |
| 12.x    | ^0.7.0           |
| 11.x    | ^0.7.0           |
| 10.x    | ^0.7.0           |
| 9.x     | ^0.7.0           |
| 8.x     | ^0.7.0           |

The dependency on `xtl` is required if you want to support vectorization for `xtl::xcomplex`. In this case, you must build your project with C++14 support enabled.
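As an illustration of the complex-number support mentioned above, here is a minimal sketch (not taken from the original README) of operating on a batch of `std::complex` values; the same pattern applies to `xtl::xcomplex` once xtl support is enabled. The batch width is architecture-dependent, so the buffers are sized from `batch_type::size`.

```cpp
#include <complex>
#include <vector>
#include "xsimd/xsimd.hpp"

namespace xs = xsimd;

int main()
{
    using cplx = std::complex<double>;
    using batch_type = xs::batch<cplx>; // batch of complex numbers for the default architecture

    // Buffers holding exactly one batch, aligned so that aligned loads/stores are valid.
    std::vector<cplx, xs::aligned_allocator<cplx>> in(batch_type::size, cplx(1.0, -2.0));
    std::vector<cplx, xs::aligned_allocator<cplx>> out(batch_type::size);

    auto b = batch_type::load_aligned(in.data()); // load a whole batch of complex values
    auto c = b * b + cplx(0.5, 0.5);              // complex arithmetic, applied element-wise
    c.store_aligned(out.data());
    return 0;
}
```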
## Usage

Version 8 of the library is a complete rewrite, and there are some slight differences with the 7.x versions. A migration guide will be available soon. In the meantime, the following examples show how to use the library.

### Explicit use of an instruction set extension

Here is an example that computes the mean of two sets of 4 double floating point values, assuming AVX extension is supported:

```cpp
#include <iostream>
#include "xsimd/xsimd.hpp"

namespace xs = xsimd;

int main(int argc, char* argv[])
{
    xs::batch<double, xs::avx> a = {1.5, 2.5, 3.5, 4.5};
    xs::batch<double, xs::avx> b = {2.5, 3.5, 4.5, 5.5};
    auto mean = (a + b) / 2;
    std::cout << mean << std::endl;
    return 0;
}
```

Do not forget to enable AVX extension when building the example. With gcc or clang, this is done with the `-mavx` flag, on MSVC you have to pass the `/arch:AVX` option.

This example outputs:

```cpp
(2.0, 3.0, 4.0, 5.0)
```

### Auto detection of the instruction set extension to be used

The same computation operating on vectors and using the most performant instruction set available:

```cpp
#include <cstddef>
#include <vector>
#include "xsimd/xsimd.hpp"

namespace xs = xsimd;

using vector_type = std::vector<double, xsimd::aligned_allocator<double>>;

void mean(const vector_type& a, const vector_type& b, vector_type& res)
{
    std::size_t size = a.size();
    constexpr std::size_t simd_size = xsimd::simd_type<double>::size;
    std::size_t vec_size = size - size % simd_size;

    for(std::size_t i = 0; i < vec_size; i += simd_size)
    {
        auto ba = xs::load_aligned(&a[i]);
        auto bb = xs::load_aligned(&b[i]);
        auto bres = (ba + bb) / 2.;
        bres.store_aligned(&res[i]);
    }

    for(std::size_t i = vec_size; i < size; ++i)
    {
        res[i] = (a[i] + b[i]) / 2.;
    }
}
```
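The batch width used by the auto-detected path above is fixed at compile time by the best instruction set enabled in the build flags. The following short sketch (an addition, not part of the original README) prints what was selected, which can be handy when checking that the expected extension is actually in use:

```cpp
#include <iostream>
#include "xsimd/xsimd.hpp"

int main()
{
    // xsimd::default_arch is the best architecture enabled at compile time;
    // the width of xsimd::batch<T> follows from it (e.g. 4 floats with SSE2, 8 with AVX).
    std::cout << "default arch: " << xsimd::default_arch::name() << '\n';
    std::cout << "batch<float>::size  = " << xsimd::batch<float>::size << '\n';
    std::cout << "batch<double>::size = " << xsimd::batch<double>::size << '\n';
    return 0;
}
```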
## Building and Running the Tests

Building the tests requires [cmake](https://cmake.org).

`cmake` is available as a package for most linux distributions. Besides, they can also be installed with the `conda` package manager (even on windows):

```bash
conda install -c conda-forge cmake
```

Once `cmake` is installed, you can build and run the tests:

```bash
mkdir build
cd build
cmake ../ -DBUILD_TESTS=ON
make xtest
```

In the context of continuous integration with Travis CI, tests are run in a `conda` environment, which can be activated with

```bash
cd test
conda env create -f ./test-environment.yml
source activate test-xsimd
cd ..
cmake . -DBUILD_TESTS=ON
make xtest
```

## Building the HTML Documentation

xsimd's documentation is built with three tools

- [doxygen](http://www.doxygen.org)
- [sphinx](http://www.sphinx-doc.org)
- [breathe](https://breathe.readthedocs.io)

While doxygen must be installed separately, you can install breathe by typing

```bash
pip install breathe
```

Breathe can also be installed with `conda`

```bash
conda install -c conda-forge breathe
```

Finally, build the documentation with

```bash
make html
```

from the `docs` subdirectory.

## License

We use a shared copyright model that enables all contributors to maintain the copyright on their contributions.

This software is licensed under the BSD-3-Clause license. See the [LICENSE](LICENSE) file for details.

xsimd-13.2.0/benchmark/CMakeLists.txt
############################################################################
# Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         #
# Martin Renou                                                             #
# Copyright (c) QuantStack                                                 #
# Copyright (c) Serge Guelton                                              #
#                                                                          #
# Distributed under the terms of the BSD 3-Clause License.                 #
#                                                                          #
# The full license is in the file LICENSE, distributed with this software. #
############################################################################

cmake_minimum_required(VERSION 3.1)

if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
    project(xsimd-benchmark)

    find_package(xsimd REQUIRED CONFIG)
    set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIRS})
endif ()

if(NOT CMAKE_BUILD_TYPE)
    message(STATUS "Setting tests build type to Release")
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
else()
    message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}")
endif()

include(CheckCXXCompilerFlag)

string(TOUPPER "${CMAKE_BUILD_TYPE}" U_CMAKE_BUILD_TYPE)

if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel")
    if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
    endif()
    if(NOT MSVC)
        CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG)
        if (ENABLE_XTL_COMPLEX)
            CHECK_CXX_COMPILER_FLAG("-std=c++14" HAS_CPP14_FLAG)
            if (NOT HAS_CPP14_FLAG)
                message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++14 support when xtl complex support is enabled")
            endif()
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
        else()
            CHECK_CXX_COMPILER_FLAG("-std=c++11" HAS_CPP11_FLAG)
            if (NOT HAS_CPP11_FLAG)
                message(FATAL_ERROR "Unsupported compiler -- xsimd requires C++11 support!")
            else()
                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
            endif()
        endif()
    endif()
endif()

if(MSVC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc /MP /bigobj")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
    set(CMAKE_EXE_LINKER_FLAGS /MANIFEST:NO)
    foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
        string(REPLACE "/MD" "-MT" ${flag_var} "${${flag_var}}")
    endforeach()
endif()

include_directories(${XSIMD_INCLUDE_DIR})

set(XSIMD_BENCHMARK
    main.cpp
    xsimd_benchmark.hpp
)

set(XSIMD_BENCHMARK_TARGET benchmark_xsimd)
add_executable(${XSIMD_BENCHMARK_TARGET} ${XSIMD_BENCHMARK} ${XSIMD_HEADERS})

if(ENABLE_XTL_COMPLEX)
    target_link_libraries(benchmark_xsimd PRIVATE xtl)
endif()

add_custom_target(xbenchmark
    COMMAND benchmark_xsimd
    DEPENDS ${XSIMD_BENCHMARK_TARGET})
xsimd-13.2.0/benchmark/main.cpp
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#include "xsimd_benchmark.hpp"
#include <map>

void benchmark_operation()
{
    // std::size_t size = 9984;
    std::size_t size = 20000;
    xsimd::run_benchmark_2op(xsimd::add_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_2op(xsimd::sub_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_2op(xsimd::mul_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_2op(xsimd::div_fn(), std::cout, size, 1000);
}

void benchmark_exp_log()
{
    std::size_t size = 20000;
    xsimd::run_benchmark_1op(xsimd::exp_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::exp2_fn(), std::cout, size, 100);
    xsimd::run_benchmark_1op(xsimd::expm1_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::log_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::log2_fn(), std::cout, size, 100);
    xsimd::run_benchmark_1op(xsimd::log10_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::log1p_fn(), std::cout, size, 1000);
}

void benchmark_trigo()
{
    std::size_t size = 20000;
    xsimd::run_benchmark_1op(xsimd::sin_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::cos_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::tan_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::asin_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo);
    xsimd::run_benchmark_1op(xsimd::acos_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo);
    xsimd::run_benchmark_1op(xsimd::atan_fn(), std::cout, size, 1000, xsimd::init_method::arctrigo);
}

void benchmark_hyperbolic()
{
    std::size_t size = 20000;
    xsimd::run_benchmark_1op(xsimd::sinh_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::cosh_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::tanh_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::asinh_fn(), std::cout, size, 100);
    xsimd::run_benchmark_1op(xsimd::acosh_fn(), std::cout, size, 100);
    xsimd::run_benchmark_1op(xsimd::atanh_fn(), std::cout, size, 100);
}

void benchmark_power()
{
    std::size_t size = 20000;
    xsimd::run_benchmark_2op(xsimd::pow_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::sqrt_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::cbrt_fn(), std::cout, size, 100);
    xsimd::run_benchmark_2op(xsimd::hypot_fn(), std::cout, size, 1000);
}

void benchmark_rounding()
{
    std::size_t size = 20000;
    xsimd::run_benchmark_1op(xsimd::ceil_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::floor_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::trunc_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::round_fn(), std::cout, size, 100);
    xsimd::run_benchmark_1op(xsimd::nearbyint_fn(), std::cout, size, 100);
    xsimd::run_benchmark_1op(xsimd::rint_fn(), std::cout, size, 100);
}

#ifdef XSIMD_POLY_BENCHMARKS
void benchmark_poly_evaluation()
{
    std::size_t size = 20000;
    xsimd::run_benchmark_1op(xsimd::horner_5_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::estrin_5_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::horner_10_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::estrin_10_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::horner_12_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::estrin_12_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::horner_14_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::estrin_14_fn(), std::cout, size, 1000);
    xsimd::run_benchmark_1op(xsimd::horner_16_fn(), std::cout, size, 1000);
xsimd::run_benchmark_1op(xsimd::estrin_16_fn(), std::cout, size, 1000); } #endif void benchmark_basic_math() { std::size_t size = 20000; xsimd::run_benchmark_2op(xsimd::fmod_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::remainder_fn(), std::cout, size, 1000); xsimd::run_benchmark_2op(xsimd::fdim_fn(), std::cout, size, 1000); xsimd::run_benchmark_3op(xsimd::clip_fn(), std::cout, size, 1000); #if 0 xsimd::run_benchmark_1op_pred(xsimd::isfinite_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::isinf_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_flint_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_odd_fn(), std::cout, size, 100); xsimd::run_benchmark_1op_pred(xsimd::is_even_fn(), std::cout, size, 100); #endif } int main(int argc, char* argv[]) { const std::map> fn_map = { { "op", { "arithmetic", benchmark_operation } }, { "exp", { "exponential and logarithm", benchmark_exp_log } }, { "trigo", { "trigonometric", benchmark_trigo } }, { "hyperbolic", { "hyperbolic", benchmark_hyperbolic } }, { "power", { "power", benchmark_power } }, { "basic_math", { "basic math", benchmark_basic_math } }, { "rounding", { "rounding", benchmark_rounding } }, #ifdef XSIMD_POLY_BENCHMARKS { "utils", { "polynomial evaluation", benchmark_poly_evaluation } }, #endif }; if (argc > 1) { if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") { std::cout << "Available options:" << std::endl; for (auto const& kv : fn_map) { std::cout << kv.first << ": run benchmark on " << kv.second.first << " functions" << std::endl; } } else { std::cout << "############################" << std::endl << "# " << xsimd::default_arch::name() << std::endl << "############################" << std::endl; for (int i = 1; i < argc; ++i) { fn_map.at(argv[i]).second(); } } } else { std::cout << "############################" << std::endl << "# " << xsimd::default_arch::name() << std::endl << "############################" << std::endl; for (auto const& kv : fn_map) { kv.second.second(); } } return 0; } xsimd-13.2.0/benchmark/xsimd_benchmark.hpp000066400000000000000000000557771475736624100205660ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_BENCHMARK_HPP #define XSIMD_BENCHMARK_HPP #include "xsimd/arch/xsimd_scalar.hpp" #include "xsimd/xsimd.hpp" #include #include #include #include namespace xsimd { using duration_type = std::chrono::duration; template using bench_vector = std::vector>; template void init_benchmark(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(0.5) + std::sqrt(T(i)) * T(9.) / T(size); rhs[i] = T(10.2) / T(i + 2) + T(0.25); } } template void init_benchmark(bench_vector& op0, bench_vector& op1, bench_vector& op2, bench_vector& res, size_t size) { op0.resize(size); op1.resize(size); op2.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { op0[i] = T(0.5) + std::sqrt(T(i)) * T(9.) 
/ T(size); op1[i] = T(10.2) / T(i + 3) + T(0.25); op2[i] = T(20.1) / T(i + 2) + T(0.65); } } template void init_benchmark_arctrigo(bench_vector& lhs, bench_vector& rhs, bench_vector& res, size_t size) { lhs.resize(size); rhs.resize(size); res.resize(size); for (size_t i = 0; i < size; ++i) { lhs[i] = T(-1.) + T(2.) * T(i) / T(size); rhs[i] = T(i) / T(i + 2) + T(0.25); } } enum class init_method { classic, arctrigo }; template duration_type benchmark_scalar(F f, V& lhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& lhs, V& rhs, V& res, std::size_t number) { size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(lhs[i], rhs[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_scalar(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (size_t i = 0; i < s; ++i) { res[i] = f(op0[i], op1[i], op2[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs = B::load_aligned(&lhs[i]); B bres = f(blhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs = B::load_aligned(&lhs[i]), blhs2 = B::load_aligned(&lhs[j]), blhs3 = B::load_aligned(&lhs[k]), blhs4 = B::load_aligned(&lhs[l]); B bres = f(blhs); B bres2 = f(blhs2); B bres3 = f(blhs3); B bres4 = f(blhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B blhs = B::load_aligned(&lhs[i]), brhs = B::load_aligned(&rhs[i]); B bres = f(blhs, brhs); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& lhs, V& rhs, V& res, std::size_t number) { std::size_t s = lhs.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B blhs = B::load_aligned(&lhs[i]), brhs = B::load_aligned(&rhs[i]), blhs2 = B::load_aligned(&lhs[j]), brhs2 = B::load_aligned(&rhs[j]); B blhs3 = B::load_aligned(&lhs[k]), brhs3 = B::load_aligned(&rhs[k]), blhs4 = B::load_aligned(&lhs[l]), brhs4 = B::load_aligned(&rhs[l]); B bres = f(blhs, brhs); B bres2 = f(blhs2, brhs2); B bres3 = f(blhs3, brhs3); B bres4 = f(blhs4, brhs4); bres.store_aligned(&res[i]); bres2.store_aligned(&res[j]); bres3.store_aligned(&res[k]); bres4.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - B::size); i += B::size) { B bop0 = B::load_aligned(&op0[i]), bop1 = B::load_aligned(&op1[i]), bop2 = B::load_aligned(&op2[i]); B bres = f(bop0, bop1, bop2); bres.store_aligned(&res[i]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? tmp : t_res; } return t_res; } template duration_type benchmark_simd_unrolled(F f, V& op0, V& op1, V& op2, V& res, std::size_t number) { std::size_t s = op0.size(); std::size_t inc = 4 * B::size; duration_type t_res = duration_type::max(); for (std::size_t count = 0; count < number; ++count) { auto start = std::chrono::steady_clock::now(); for (std::size_t i = 0; i <= (s - inc); i += inc) { size_t j = i + B::size; size_t k = j + B::size; size_t l = k + B::size; B bop0_i = B::load_aligned(&op0[i]), bop1_i = B::load_aligned(&op1[i]), bop2_i = B::load_aligned(&op2[i]); B bop0_j = B::load_aligned(&op0[j]), bop1_j = B::load_aligned(&op1[j]), bop2_j = B::load_aligned(&op2[j]); B bop0_k = B::load_aligned(&op0[k]), bop1_k = B::load_aligned(&op1[k]), bop2_k = B::load_aligned(&op2[k]); B bop0_l = B::load_aligned(&op0[l]), bop1_l = B::load_aligned(&op1[l]), bop2_l = B::load_aligned(&op2[l]); B bres_i = f(bop0_i, bop1_i, bop2_i); B bres_j = f(bop0_j, bop1_j, bop2_j); B bres_k = f(bop0_k, bop1_k, bop2_k); B bres_l = f(bop0_l, bop1_l, bop2_l); bres_i.store_aligned(&res[i]); bres_j.store_aligned(&res[j]); bres_k.store_aligned(&res[k]); bres_l.store_aligned(&res[l]); } auto end = std::chrono::steady_clock::now(); auto tmp = end - start; t_res = tmp < t_res ? 
tmp : t_res; } return t_res; } template void run_benchmark_1op(F f, OS& out, std::size_t size, std::size_t iter, init_method init = init_method::classic) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; switch (init) { case init_method::classic: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; case init_method::arctrigo: init_benchmark_arctrigo(f_lhs, f_rhs, f_res, size); init_benchmark_arctrigo(d_lhs, d_rhs, d_res, size); break; default: init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); break; } #ifndef XSIMD_POLY_BENCHMARKS duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_res, iter); #endif duration_type t_float_vector = benchmark_simd>(f, f_lhs, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_lhs, f_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_lhs, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_lhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; #endif out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; #ifndef XSIMD_POLY_BENCHMARKS out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #endif #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_2op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_lhs, f_rhs, f_res; bench_vector d_lhs, d_rhs, d_res; init_benchmark(f_lhs, f_rhs, f_res, size); init_benchmark(d_lhs, d_rhs, d_res, size); duration_type t_float_scalar = benchmark_scalar(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_vector = benchmark_simd>(f, f_lhs, f_rhs, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_lhs, f_rhs, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_lhs, d_rhs, d_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_lhs, d_rhs, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_lhs, d_rhs, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } template void run_benchmark_3op(F f, OS& out, std::size_t size, std::size_t iter) { bench_vector f_op0, f_op1, f_op2, f_res; bench_vector d_op0, d_op1, d_op2, d_res; init_benchmark(f_op0, f_op1, f_op2, f_res, size); init_benchmark(d_op0, d_op1, d_op2, d_res, size); 
duration_type t_float_scalar = benchmark_scalar(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_vector = benchmark_simd>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_float_vector_u = benchmark_simd_unrolled>(f, f_op0, f_op1, f_op2, f_res, iter); duration_type t_double_scalar = benchmark_scalar(f, d_op0, d_op1, d_op2, d_res, iter); #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 duration_type t_double_vector = benchmark_simd>(f, d_op0, d_op1, d_op2, d_res, iter); duration_type t_double_vector_u = benchmark_simd_unrolled>(f, d_op0, d_op1, d_op2, d_res, iter); #endif out << "============================" << std::endl; out << f.name() << std::endl; out << "scalar float : " << t_float_scalar.count() << "ms" << std::endl; out << "vector float : " << t_float_vector.count() << "ms" << std::endl; out << "vector float unr : " << t_float_vector_u.count() << "ms" << std::endl; out << "scalar double : " << t_double_scalar.count() << "ms" << std::endl; #if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 out << "vector double : " << t_double_vector.count() << "ms" << std::endl; out << "vector double unr : " << t_double_vector_u.count() << "ms" << std::endl; #endif out << "============================" << std::endl; } #define DEFINE_OP_FUNCTOR_2OP(OP, NAME) \ struct NAME##_fn \ { \ template \ inline T operator()(const T& lhs, const T& rhs) const \ { \ return lhs OP rhs; \ } \ inline std::string name() const \ { \ return #NAME; \ } \ } #define DEFINE_FUNCTOR_1OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& x) const \ { \ using xsimd::FN; \ return FN(x); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } #define DEFINE_FUNCTOR_1OP_TEMPLATE(NAME, FN, N, ...) \ struct NAME##_##N##_fn \ { \ template \ inline T operator()(const T& x) const \ { \ using xsimd::FN; \ return FN(x); \ } \ inline std::string name() const \ { \ return #FN " " #N; \ } \ } #define DEFINE_FUNCTOR_2OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& lhs, const T& rhs) const \ { \ using xsimd::FN; \ return FN(lhs, rhs); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } #define DEFINE_FUNCTOR_3OP(FN) \ struct FN##_fn \ { \ template \ inline T operator()(const T& op0, const T& op1, const T& op2) const \ { \ using xsimd::FN; \ return FN(op0, op1, op2); \ } \ inline std::string name() const \ { \ return #FN; \ } \ } DEFINE_OP_FUNCTOR_2OP(+, add); DEFINE_OP_FUNCTOR_2OP(-, sub); DEFINE_OP_FUNCTOR_2OP(*, mul); DEFINE_OP_FUNCTOR_2OP(/, div); DEFINE_FUNCTOR_1OP(exp); DEFINE_FUNCTOR_1OP(exp2); DEFINE_FUNCTOR_1OP(expm1); DEFINE_FUNCTOR_1OP(log); DEFINE_FUNCTOR_1OP(log10); DEFINE_FUNCTOR_1OP(log2); DEFINE_FUNCTOR_1OP(log1p); DEFINE_FUNCTOR_1OP(sin); DEFINE_FUNCTOR_1OP(cos); DEFINE_FUNCTOR_1OP(tan); DEFINE_FUNCTOR_1OP(asin); DEFINE_FUNCTOR_1OP(acos); DEFINE_FUNCTOR_1OP(atan); DEFINE_FUNCTOR_1OP(sinh); DEFINE_FUNCTOR_1OP(cosh); DEFINE_FUNCTOR_1OP(tanh); DEFINE_FUNCTOR_1OP(asinh); DEFINE_FUNCTOR_1OP(acosh); DEFINE_FUNCTOR_1OP(atanh); DEFINE_FUNCTOR_2OP(pow); DEFINE_FUNCTOR_1OP(sqrt); DEFINE_FUNCTOR_1OP(cbrt); DEFINE_FUNCTOR_2OP(hypot); DEFINE_FUNCTOR_1OP(ceil); DEFINE_FUNCTOR_1OP(floor); DEFINE_FUNCTOR_1OP(trunc); DEFINE_FUNCTOR_1OP(round); DEFINE_FUNCTOR_1OP(nearbyint); DEFINE_FUNCTOR_1OP(rint); DEFINE_FUNCTOR_2OP(fmod); DEFINE_FUNCTOR_2OP(remainder); DEFINE_FUNCTOR_2OP(fdim); DEFINE_FUNCTOR_3OP(clip); #if 0 DEFINE_FUNCTOR_1OP(isfinite); DEFINE_FUNCTOR_1OP(isinf); DEFINE_FUNCTOR_1OP(is_flint); DEFINE_FUNCTOR_1OP(is_odd); DEFINE_FUNCTOR_1OP(is_even); #endif 
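// Note: each DEFINE_FUNCTOR_* macro above generates a small functor wrapping
// either an operator or an xsimd math function. As a rough sketch (not the
// literal expansion), DEFINE_FUNCTOR_1OP(exp) yields something equivalent to:
//
//     struct exp_fn
//     {
//         template <class T>
//         inline T operator()(const T& x) const
//         {
//             using xsimd::exp;
//             return exp(x); // resolves to xsimd::exp for batch arguments
//         }
//         inline std::string name() const { return "exp"; }
//     };
//
// run_benchmark_1op and friends then time operator() over plain scalar loops
// and over xsimd::batch loads/stores.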
DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 5, 1, 2, 3, 4, 5); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14); DEFINE_FUNCTOR_1OP_TEMPLATE(horner, kernel::horner, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); DEFINE_FUNCTOR_1OP_TEMPLATE(estrin, kernel::estrin, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); } #endif xsimd-13.2.0/cmake/000077500000000000000000000000001475736624100140205ustar00rootroot00000000000000xsimd-13.2.0/cmake/JoinPaths.cmake000066400000000000000000000016771475736624100167340ustar00rootroot00000000000000# This module provides function for joining paths # known from from most languages # # Original license: # SPDX-License-Identifier: (MIT OR CC0-1.0) # Explicit permission given to distribute this module under # the terms of the project as described in /LICENSE.rst. # Copyright 2020 Jan Tojnar # https://github.com/jtojnar/cmake-snips # # Modelled after Python’s os.path.join # https://docs.python.org/3.7/library/os.path.html#os.path.join # Windows not supported function(join_paths joined_path first_path_segment) set(temp_path "${first_path_segment}") foreach(current_segment IN LISTS ARGN) if(NOT ("${current_segment}" STREQUAL "")) if(IS_ABSOLUTE "${current_segment}") set(temp_path "${current_segment}") else() set(temp_path "${temp_path}/${current_segment}") endif() endif() endforeach() set(${joined_path} "${temp_path}" PARENT_SCOPE) endfunction() xsimd-13.2.0/docs/000077500000000000000000000000001475736624100136705ustar00rootroot00000000000000xsimd-13.2.0/docs/Doxyfile000066400000000000000000000045121475736624100154000ustar00rootroot00000000000000PROJECT_NAME = "xsimd" XML_OUTPUT = xml INPUT = ../include/xsimd/types/xsimd_api.hpp \ ../include/xsimd/types/xsimd_batch.hpp \ ../include/xsimd/types/xsimd_batch_constant.hpp \ ../include/xsimd/config/xsimd_arch.hpp \ ../include/xsimd/config/xsimd_config.hpp \ ../include/xsimd/memory/xsimd_alignment.hpp \ ../include/xsimd/memory/xsimd_aligned_allocator.hpp \ ../include/xsimd/types/xsimd_generic_arch.hpp \ ../include/xsimd/types/xsimd_traits.hpp \ ../include/xsimd/types/xsimd_avx2_register.hpp \ ../include/xsimd/types/xsimd_avx512bw_register.hpp \ ../include/xsimd/types/xsimd_avx512cd_register.hpp \ ../include/xsimd/types/xsimd_avx512dq_register.hpp \ ../include/xsimd/types/xsimd_avx512f_register.hpp \ ../include/xsimd/types/xsimd_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx_register.hpp \ ../include/xsimd/types/xsimd_fma3_avx2_register.hpp \ ../include/xsimd/types/xsimd_fma3_sse_register.hpp \ ../include/xsimd/types/xsimd_fma4_register.hpp \ ../include/xsimd/types/xsimd_neon64_register.hpp \ ../include/xsimd/types/xsimd_neon_register.hpp \ ../include/xsimd/types/xsimd_rvv_register.hpp \ ../include/xsimd/types/xsimd_sse2_register.hpp \ ../include/xsimd/types/xsimd_sse3_register.hpp \ ../include/xsimd/types/xsimd_sse4_1_register.hpp \ ../include/xsimd/types/xsimd_sse4_2_register.hpp \ 
../include/xsimd/types/xsimd_ssse3_register.hpp \ ../include/xsimd/types/xsimd_sve_register.hpp GENERATE_LATEX = NO GENERATE_MAN = NO GENERATE_RTF = NO CASE_SENSE_NAMES = NO GENERATE_HTML = NO GENERATE_XML = YES RECURSIVE = YES QUIET = YES JAVADOC_AUTOBRIEF = YES WARN_IF_UNDOCUMENTED = NO WARN_AS_ERROR = NO ENABLE_PREPROCESSING = YES MACRO_EXPANSION = YES EXPAND_ONLY_PREDEF = YES PREDEFINED = XSIMD_NO_DISCARD= XSIMD_INLINE=inline xsimd-13.2.0/docs/Makefile000066400000000000000000000147261475736624100153420ustar00rootroot00000000000000# You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build PAPER = BUILDDIR = build # User-friendly check for sphinx-build ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext api default: html help: @echo "Please use \`make ' where is one of" @echo " html to make standalone HTML files" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @echo " pickle to make pickle files" @echo " json to make JSON files" @echo " htmlhelp to make HTML files and a HTML help project" @echo " qthelp to make HTML files and a qthelp project" @echo " applehelp to make an Apple Help Book" @echo " devhelp to make HTML files and a Devhelp project" @echo " epub to make an epub" @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" @echo " latexpdf to make LaTeX files and run them through pdflatex" @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" @echo " text to make text files" @echo " man to make manual pages" @echo " texinfo to make Texinfo files" @echo " info to make Texinfo files and run them through makeinfo" @echo " gettext to make PO message catalogs" @echo " changes to make an overview of all changed/added/deprecated items" @echo " xml to make Docutils-native XML files" @echo " pseudoxml to make pseudoxml-XML files for display purposes" @echo " linkcheck to check all external links for integrity" @echo " doctest to run all doctests embedded in the documentation (if enabled)" @echo " coverage to run coverage check of the documentation (if enabled)" clean: rm -rf $(BUILDDIR)/* html: doxygen $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." dirhtml: doxygen $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." singlehtml: doxygen $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml @echo @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
pickle: doxygen $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle @echo @echo "Build finished; now you can process the pickle files." json: doxygen $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json @echo @echo "Build finished; now you can process the JSON files." htmlhelp: doxygen $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp @echo @echo "Build finished; now you can run HTML Help Workshop with the" \ ".hhp project file in $(BUILDDIR)/htmlhelp." epub: doxygen $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub @echo @echo "Build finished. The epub file is in $(BUILDDIR)/epub." latex: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." @echo "Run \`make' in that directory to run these through (pdf)latex" \ "(use \`make latexpdf' here to do that automatically)." latexpdf: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through pdflatex..." $(MAKE) -C $(BUILDDIR)/latex all-pdf @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." latexpdfja: doxygen $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex @echo "Running LaTeX files through platex and dvipdfmx..." $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." text: doxygen $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text @echo @echo "Build finished. The text files are in $(BUILDDIR)/text." man: doxygen $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man @echo @echo "Build finished. The manual pages are in $(BUILDDIR)/man." texinfo: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." @echo "Run \`make' in that directory to run these through makeinfo" \ "(use \`make info' here to do that automatically)." info: doxygen $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo @echo "Running Texinfo files through makeinfo..." make -C $(BUILDDIR)/texinfo info @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." gettext: doxygen $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale @echo @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." changes: doxygen $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes @echo @echo "The overview file is in $(BUILDDIR)/changes." linkcheck: doxygen $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck @echo @echo "Link check complete; look for any errors in the above output " \ "or in $(BUILDDIR)/linkcheck/output.txt." doctest: doxygen $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest @echo "Testing of doctests in the sources finished, look at the " \ "results in $(BUILDDIR)/doctest/output.txt." coverage: doxygen $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage @echo "Testing of coverage in the sources finished, look at the " \ "results in $(BUILDDIR)/coverage/python.txt." xml: doxygen $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml @echo @echo "Build finished. The XML files are in $(BUILDDIR)/xml." pseudoxml: doxygen $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml @echo @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
xsimd-13.2.0/docs/environment.yml000066400000000000000000000001531475736624100167560ustar00rootroot00000000000000name: xsimd-docs channels: - conda-forge dependencies: - breathe - sphinx_rtd_theme - sphinx=6.* xsimd-13.2.0/docs/make.bat000066400000000000000000000161651475736624100153060ustar00rootroot00000000000000@ECHO OFF REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set BUILDDIR=build set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source set I18NSPHINXOPTS=%SPHINXOPTS% source if NOT "%PAPER%" == "" ( set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% ) if "%1" == "" goto help if "%1" == "help" ( :help echo.Please use `make ^` where ^ is one of echo. html to make standalone HTML files echo. dirhtml to make HTML files named index.html in directories echo. singlehtml to make a single large HTML file echo. pickle to make pickle files echo. json to make JSON files echo. htmlhelp to make HTML files and a HTML help project echo. qthelp to make HTML files and a qthelp project echo. devhelp to make HTML files and a Devhelp project echo. epub to make an epub echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter echo. text to make text files echo. man to make manual pages echo. texinfo to make Texinfo files echo. gettext to make PO message catalogs echo. changes to make an overview over all changed/added/deprecated items echo. xml to make Docutils-native XML files echo. pseudoxml to make pseudoxml-XML files for display purposes echo. linkcheck to check all external links for integrity echo. doctest to run all doctests embedded in the documentation if enabled echo. coverage to run coverage check of the documentation if enabled goto end ) if "%1" == "clean" ( for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i del /q /s %BUILDDIR%\* goto end ) REM Check if sphinx-build is available and fallback to Python version if any %SPHINXBUILD% 1>NUL 2>NUL if errorlevel 9009 goto sphinx_python goto sphinx_ok :sphinx_python set SPHINXBUILD=python -m sphinx.__init__ %SPHINXBUILD% 2> nul if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) :sphinx_ok if "%1" == "html" ( doxygen %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/html. goto end ) if "%1" == "dirhtml" ( %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. goto end ) if "%1" == "singlehtml" ( %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml if errorlevel 1 exit /b 1 echo. echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. goto end ) if "%1" == "pickle" ( %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the pickle files. goto end ) if "%1" == "json" ( %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can process the JSON files. 
goto end ) if "%1" == "htmlhelp" ( %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run HTML Help Workshop with the ^ .hhp project file in %BUILDDIR%/htmlhelp. goto end ) if "%1" == "qthelp" ( %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp if errorlevel 1 exit /b 1 echo. echo.Build finished; now you can run "qcollectiongenerator" with the ^ .qhcp project file in %BUILDDIR%/qthelp, like this: echo.^> qcollectiongenerator %BUILDDIR%\qthelp\packagename.qhcp echo.To view the help file: echo.^> assistant -collectionFile %BUILDDIR%\qthelp\packagename.ghc goto end ) if "%1" == "devhelp" ( %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp if errorlevel 1 exit /b 1 echo. echo.Build finished. goto end ) if "%1" == "epub" ( %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub if errorlevel 1 exit /b 1 echo. echo.Build finished. The epub file is in %BUILDDIR%/epub. goto end ) if "%1" == "latex" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex if errorlevel 1 exit /b 1 echo. echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdf" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "latexpdfja" ( %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex cd %BUILDDIR%/latex make all-pdf-ja cd %~dp0 echo. echo.Build finished; the PDF files are in %BUILDDIR%/latex. goto end ) if "%1" == "text" ( %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text if errorlevel 1 exit /b 1 echo. echo.Build finished. The text files are in %BUILDDIR%/text. goto end ) if "%1" == "man" ( %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man if errorlevel 1 exit /b 1 echo. echo.Build finished. The manual pages are in %BUILDDIR%/man. goto end ) if "%1" == "texinfo" ( %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo if errorlevel 1 exit /b 1 echo. echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. goto end ) if "%1" == "gettext" ( %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale if errorlevel 1 exit /b 1 echo. echo.Build finished. The message catalogs are in %BUILDDIR%/locale. goto end ) if "%1" == "changes" ( %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes if errorlevel 1 exit /b 1 echo. echo.The overview file is in %BUILDDIR%/changes. goto end ) if "%1" == "linkcheck" ( %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck if errorlevel 1 exit /b 1 echo. echo.Link check complete; look for any errors in the above output ^ or in %BUILDDIR%/linkcheck/output.txt. goto end ) if "%1" == "doctest" ( %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest if errorlevel 1 exit /b 1 echo. echo.Testing of doctests in the sources finished, look at the ^ results in %BUILDDIR%/doctest/output.txt. goto end ) if "%1" == "coverage" ( %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage if errorlevel 1 exit /b 1 echo. echo.Testing of coverage in the sources finished, look at the ^ results in %BUILDDIR%/coverage/python.txt. goto end ) if "%1" == "xml" ( %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml if errorlevel 1 exit /b 1 echo. echo.Build finished. The XML files are in %BUILDDIR%/xml. goto end ) if "%1" == "pseudoxml" ( %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml if errorlevel 1 exit /b 1 echo. echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
goto end ) :end xsimd-13.2.0/docs/source/_static/main_stylesheet.css .wy-nav-content{ max-width: 1000px; margin: auto; } xsimd-13.2.0/docs/source/api/aligned_allocator.rst .. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Alignment manipulation ====================== Aligned memory allocator ------------------------ .. doxygenclass:: xsimd::aligned_allocator :project: xsimd :members: Alignment checker ------------------ .. doxygenfunction:: xsimd::is_aligned :project: xsimd xsimd-13.2.0/docs/source/api/arch.rst .. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Architecture manipulation ========================= xsimd provides a high-level description of the instruction sets it manipulates. The mentioned types are primarily used as template parameters for :ref:`batch `, and when interacting with :cpp:func:`xsimd::dispatch()`. The best architecture available at compile time is exposed as ``xsimd::best_arch``, which also happens to be ``xsimd::default_arch``. .. doxygengroup:: architectures :project: xsimd :members: Emulated mode ------------- When compiled with the macro ``XSIMD_WITH_EMULATED`` set to ``1``, xsimd also exhibits a specific architecture ``xsimd::emulated``, which consists of a vector of ``N`` bits emulated using scalar mode. It is mostly available for testing and debugging. xsimd-13.2.0/docs/source/api/arithmetic_index.rst .. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. ..
raw:: html Arithmetic operators ==================== Binary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`add` | per slot addition | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sub` | per slot subtraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mul` | per slot multiply | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`div` | per slot division | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`mod` | per slot modulo | +---------------------------------------+----------------------------------------------------+ Unary operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`neg` | per slot negate | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`pos` | per slot positive | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reciprocal` | per slot reciprocal | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`decr` | per slot decrement | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`decr_if` | per slot decrement, based on a mask | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`incr` | per slot increment | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`incr_if` | per slot increment, based on a mask | +---------------------------------------+----------------------------------------------------+ Saturated arithmetic: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sadd` | per slot saturated addition | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ssub` | per slot saturated subtraction | +---------------------------------------+----------------------------------------------------+ Fused operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fma` | fused multiply add | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fms` | fused multiply sub | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fnma` | fused negate multiply add | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fnms` | fused negate multiply sub | +---------------------------------------+----------------------------------------------------+ Average computation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`avg` | per slot average | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`avgr` | per slot rounded average | +---------------------------------------+----------------------------------------------------+ ---- .. 
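As a quick illustration (a sketch using only the operators listed above, not part of the generated reference that follows), these operations apply element-wise to whole batches:

.. code-block:: c++

    #include "xsimd/xsimd.hpp"

    xsimd::batch<float> a(2.f), b(3.f), c(1.f); // broadcasting constructors
    auto s = a + b;                             // per slot addition, same as xsimd::add(a, b)
    auto d = xsimd::fma(a, b, c);               // per slot fused multiply-add: a * b + c

..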
doxygengroup:: batch_arithmetic :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/batch_index.rst .. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch types =========== .. toctree:: :maxdepth: 1 xsimd_batch xsimd_batch_bool xsimd_batch_complex xsimd_batch_constant xsimd-13.2.0/docs/source/api/batch_manip.rst .. Copyright (c) 2021, Serge Guelton Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Conditional expression ====================== +------------------------------+-------------------------------------------+ | :cpp:func:`select`           | conditional selection with mask           | +------------------------------+-------------------------------------------+ ---- .. doxygengroup:: batch_cond :project: xsimd :content-only: In the specific case when one needs to conditionally increment or decrement a batch based on a mask, :cpp:func:`incr_if` and :cpp:func:`decr_if` provide specialized versions. xsimd-13.2.0/docs/source/api/bitwise_operators_index.rst .. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Bitwise operators ================= +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_not`               | per slot bitwise not                               | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_or`                | per slot bitwise or                                | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_xor`               | per slot bitwise xor                               | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_and`               | per slot bitwise and                               | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_andnot`            | per slot bitwise and not                           | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_lshift`            | per slot bitwise left shift                        | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_rshift`            | per slot bitwise right shift                       | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotr`                      | per slot rotate right                              | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotl`                      | per slot rotate left                               | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_bitwise :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/cast_index.rst .. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. ..
raw:: html Type conversion =============== Cast: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`batch_cast` | ``static_cast`` on batch types | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitwise_cast` | ``reinterpret_cast`` on batch types | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`batch_bool_cast` | ``static_cast`` on batch predicate types | +---------------------------------------+----------------------------------------------------+ Conversion: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_float` | per slot conversion to floating point | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`to_int` | per slot conversion to integer | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_conversion :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/comparison_index.rst000066400000000000000000000062741475736624100220450ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Comparison operators ==================== Ordering: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`eq` | per slot equals to comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`neq` | per slot different from comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`gt` | per slot strictly greater than comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`lt` | per slot strictly lower than comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ge` | per slot greater or equal to comparison | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`le` | per slot lower or equal to comparison | +---------------------------------------+----------------------------------------------------+ Parity check: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_even` | per slot check for evenness | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_odd` | per slot check for oddness | +---------------------------------------+----------------------------------------------------+ Floating point number check: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isinf` | per slot check for infinity | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isnan` | per slot check for NaN | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`isfinite` | per slot check for finite number | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`is_flint` | per slot check for float representing an 
integer | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_logical :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/data_transfer.rst000066400000000000000000000116351475736624100213160ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Data transfer ============= From memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load` | load values from memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_aligned` | load values from aligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_unaligned` | load values from unaligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`load_as` | load values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ From a scalar: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`broadcast` | broadcasting a value to all slots | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`broadcast_as` | broadcasting a value, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ To memory: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store` | store values to memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_aligned` | store values to aligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_unaligned` | store values to unaligned memory | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`store_as` | store values, forcing a type conversion | +---------------------------------------+----------------------------------------------------+ In place: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`swizzle` | rearrange slots within the batch | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`slide_left` | bitwise shift the whole batch to the left | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`slide_right` | bitwise shift the whole batch to the right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotate_left` | bitwise rotate the whole batch to the left | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rotate_right` | bitwise rotate the whole batch to the right | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`insert` | modify a single batch slot | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`compress` | pack elements according to a mask | 
+---------------------------------------+----------------------------------------------------+ | :cpp:func:`expand` | select contiguous elements from the batch | +---------------------------------------+----------------------------------------------------+ Between batches: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`transpose` | tranpose a matrix as an array of batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_lo` | interleave low halves of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`zip_hi` | interleave high halves of two batches | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_data_transfer :project: xsimd :content-only: The following empty types are used for tag dispatching: .. doxygenstruct:: xsimd::aligned_mode :project: xsimd .. doxygenstruct:: xsimd::unaligned_mode :project: xsimd xsimd-13.2.0/docs/source/api/dispatching.rst000066400000000000000000000036461475736624100210010ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Arch Dispatching: Arch Dispatching ================ `xsimd` provides a generic way to dispatch a function call based on the architecture the code was compiled for and the architectures available at runtime. The :cpp:func:`xsimd::dispatch` function takes a functor whose call operator takes an architecture parameter as first operand, followed by any number of arguments ``Args...`` and turn it into a dispatching functor that takes ``Args...`` as arguments. .. doxygenfunction:: xsimd::dispatch :project: xsimd Following code showcases a usage of the :cpp:func:`xsimd::dispatch` function: .. code-block:: c++ #include "sum.hpp" // Create the dispatching function, specifying the architecture we want to // target. auto dispatched = xsimd::dispatch>(sum{}); // Call the appropriate implementation based on runtime information. float res = dispatched(data, 17); This code does *not* require any architecture-specific flags. The architecture specific details follow. The ``sum.hpp`` header contains the function being actually called, in an architecture-agnostic description: .. literalinclude:: ../../../test/doc/sum.hpp The SSE2 and AVX2 version needs to be provided in other compilation units, compiled with the appropriate flags, for instance: .. literalinclude:: ../../../test/doc/sum_avx2.cpp .. literalinclude:: ../../../test/doc/sum_sse2.cpp xsimd-13.2.0/docs/source/api/instr_macros.rst000066400000000000000000000021371475736624100212010ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Instruction set macros ====================== Each of these macros corresponds to an instruction set supported by XSIMD. They can be used to filter arch-specific code. .. doxygengroup:: xsimd_config_macro :project: xsimd :content-only: Changing Default architecture ***************************** You can change the default instruction set used by xsimd (when none is provided explicitely) by setting the ``XSIMD_DEFAULT_ARCH`` macro to, say, ``xsimd::avx2``. 
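For instance, assuming ``XSIMD_DEFAULT_ARCH`` is defined to ``xsimd::avx2`` on the compiler command line and the translation unit is compiled with AVX2 enabled (both are assumptions of this sketch, not defaults of xsimd), batches that do not name an architecture pick it up:

.. code-block:: c++

    // Sketch: assumes -DXSIMD_DEFAULT_ARCH=xsimd::avx2 and -mavx2 (or an
    // equivalent flag) were passed to the compiler.
    #include <type_traits>
    #include "xsimd/xsimd.hpp"

    static_assert(std::is_same<xsimd::default_arch, xsimd::avx2>::value,
                  "default_arch follows XSIMD_DEFAULT_ARCH");

    // xsimd::batch<float> is xsimd::batch<float, xsimd::default_arch>,
    // so this is an AVX2 batch holding 8 floats.
    xsimd::batch<float> v(1.5f);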
A common usage is to set it to ``xsimd::unsupported`` as a way to detect instantiation of batches with the default architecture. xsimd-13.2.0/docs/source/api/math_index.rst000066400000000000000000000245771475736624100206320ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Mathematical functions ====================== Basic functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`abs` | absolute value | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fabs` | absolute value of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmod` | remainder of the floating point division operation | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`remainder` | signed remainder of the division operation | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`min` | smaller of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`max` | larger of two batches | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmin` | smaller of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fmax` | larger of two batches of floating point values | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`fdim` | positive difference | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`clip` | clipping operation | +---------------------------------------+----------------------------------------------------+ Exponential functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp` | natural exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp2` | base 2 exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`exp10` | base 10 exponential function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`expm1` | natural exponential function, minus one | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log` | natural logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log2` | base 2 logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log10` | base 10 logarithm function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`log1p` | natural logarithm of one plus function | +---------------------------------------+----------------------------------------------------+ Power functions: +-----------------------------------------+----------------------------------------------------+ | 
:cpp:func:`pow` | power function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`rsqrt` | reciprocal square root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`sqrt` | square root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`cbrt` | cubic root function | +-----------------------------------------+----------------------------------------------------+ | :cpp:func:`hypot` | hypotenuse function | +-----------------------------------------+----------------------------------------------------+ Trigonometric functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sin` | sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`cos` | cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sincos` | sine and cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tan` | tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`asin` | arc sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`acos` | arc cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atan` | arc tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atan2` | arc tangent function, determining quadrants | +---------------------------------------+----------------------------------------------------+ Hyperbolic functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sinh` | hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`cosh` | hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tanh` | hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`asinh` | inverse hyperbolic sine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`acosh` | inverse hyperbolic cosine function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`atanh` | inverse hyperbolic tangent function | +---------------------------------------+----------------------------------------------------+ Error functions: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`erf` | error function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`erfc` | complementary error function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`tgamma` | gamma function | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`lgamma` | natural logarithm of the gamma function | 
+---------------------------------------+----------------------------------------------------+ Nearint operations: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`ceil` | nearest integers not less | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`floor` | nearest integers not greater | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`trunc` | nearest integers not greater in magnitude | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`round` | nearest integers, rounding away from zero | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`nearbyint` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`rint` | nearest integers using current rounding mode | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_math :project: xsimd :content-only: .. doxygengroup:: batch_trigo :project: xsimd :content-only: .. doxygengroup:: batch_rounding :project: xsimd :content-only: .. doxygengroup:: batch_math_extra :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/misc_index.rst000066400000000000000000000034221475736624100206160ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Miscellaneous ============= Sign manipulation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`sign` | per slot sign extraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`signnz` | per slot sign extraction on non null elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`bitofsign` | per slot sign bit extraction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`copysign` | per slot sign copy | +---------------------------------------+----------------------------------------------------+ Stream operation: +---------------------------------------+----------------------------------------------------+ | :cpp:func:`operator<<` | batch pretty-printing | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_miscellaneous :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/reducer_index.rst000066400000000000000000000032211475736624100213110ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. 
raw:: html Reduction operators =================== +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce` | generic batch reduction | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_add` | sum of each batch element | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_max` | max of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`reduce_min` | min of the batch elements | +---------------------------------------+----------------------------------------------------+ | :cpp:func:`haddp` | horizontal sum across batches | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_reducers :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/type_traits.rst000066400000000000000000000035641475736624100210520ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html .. _Type Traits: Type Traits =========== `xsimd` provides a few type traits to interact with scalar and batch types in an uniformeous manner. Type check: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch` | batch type detection | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch_bool` | mask batch type detection | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`is_batch_complex` | complex batch type detection | +---------------------------------------+----------------------------------------------------+ Type access: +---------------------------------------+----------------------------------------------------+ | :cpp:class:`scalar_type` | batch element type | +---------------------------------------+----------------------------------------------------+ | :cpp:class:`mask_type` | batch mask type | +---------------------------------------+----------------------------------------------------+ ---- .. doxygengroup:: batch_traits :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/xsimd_batch.rst000066400000000000000000000006061475736624100207620ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of scalars ================ .. _xsimd-batch-ref: .. doxygenclass:: xsimd::batch :project: xsimd :members: .. doxygenstruct:: xsimd::make_sized_batch :project: xsimd :members: xsimd-13.2.0/docs/source/api/xsimd_batch_bool.rst000066400000000000000000000010311475736624100217660ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of conditions =================== .. _xsimd-batch-bool-ref: .. doxygenclass:: xsimd::batch_bool :project: xsimd :members: Logical operators ----------------- .. doxygengroup:: batch_bool_logical :project: xsimd :content-only: Reducers -------- .. 
doxygengroup:: batch_bool_reducers :project: xsimd :content-only: xsimd-13.2.0/docs/source/api/xsimd_batch_complex.rst000066400000000000000000000014261475736624100225120ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of complex numbers ======================== .. doxygenclass:: xsimd::batch< std::complex< T >, A > :project: xsimd :members: Operations specific to batches of complex numbers ------------------------------------------------- .. doxygengroup:: batch_complex :project: xsimd :content-only: XTL complex support ------------------- If the preprocessor token ``XSIMD_ENABLE_XTL_COMPLEX`` is defined, ``xsimd`` provides constructors of ``xsimd::batch< std::complex< T >, A >`` from ``xtl::xcomplex``, similar to those for ``std::complex``. This requires ``xtl`` to be installed. xsimd-13.2.0/docs/source/api/xsimd_batch_constant.rst000066400000000000000000000006241475736624100226730ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille, Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Batch of constants ================== .. _xsimd-batch-constant-ref: .. doxygenstruct:: xsimd::batch_constant :project: xsimd :members: .. doxygenfunction:: xsimd::make_batch_constant :project: xsimd xsimd-13.2.0/docs/source/basic_usage.rst000066400000000000000000000032401475736624100201660ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. Basic usage =========== Manipulating abstract batches ----------------------------- Here is an example that computes the mean of two batches, using the best architecture available, based on compile-time information: .. literalinclude:: ../../test/doc/manipulating_abstract_batches.cpp The batch can be a batch of 4 single precision floating point numbers (e.g. on Neon) or a batch of 8 (e.g. on AVX2). Manipulating parametric batches ------------------------------- The previous example can be made fully parametric, both in the batch type and the underlying architecture. This is achieved as described in the following example: .. literalinclude:: ../../test/doc/manipulating_parametric_batches.cpp At its core, a :cpp:class:`xsimd::batch` is bound to the scalar type it contains, and to the instruction set it can use to operate on its values. Explicit use of an instruction set extension -------------------------------------------- Here is an example that loads two batches of 4 double precision floating point values, and computes their mean, explicitly using the AVX extension: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set.cpp Note that in this case, the instruction set is explicitly specified in the batch type. This example outputs: .. code:: (2.0, 3.0, 4.0, 5.0) .. warning:: If you allow your compiler to generate AVX2 instructions (e.g. through ``-mavx2``), there is nothing preventing it from optimizing the above code using AVX2 instructions.
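Since the ``literalinclude`` sources referenced above are not reproduced in this listing, here is a minimal, illustrative sketch of the abstract-batch mean computation (the function name ``mean`` and the assumption that ``size`` is a multiple of the batch size are ours, not taken from the shipped examples):

.. code-block:: c++

    #include "xsimd/xsimd.hpp"
    #include <cstddef>

    // The architecture is picked at compile time, based on the compiler flags.
    using batch_type = xsimd::batch<float>;

    void mean(const float* a, const float* b, float* res, std::size_t size)
    {
        constexpr std::size_t inc = batch_type::size;
        // size is assumed to be a multiple of inc for brevity.
        for (std::size_t i = 0; i < size; i += inc)
        {
            batch_type va = batch_type::load_unaligned(&a[i]);
            batch_type vb = batch_type::load_unaligned(&b[i]);
            batch_type vres = (va + vb) * 0.5f;
            vres.store_unaligned(&res[i]);
        }
    }
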
xsimd-13.2.0/docs/source/cmake.svg000066400000000000000000000425311475736624100167760ustar00rootroot00000000000000 image/svg+xml xsimd-13.2.0/docs/source/conda.svg000066400000000000000000000034151475736624100170000ustar00rootroot00000000000000xsimd-13.2.0/docs/source/conf.py000066400000000000000000000015321475736624100164700ustar00rootroot00000000000000#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import subprocess on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if on_rtd: subprocess.check_call('cd ..; doxygen', shell=True) import sphinx_rtd_theme html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] def setup(app): app.add_css_file("main_stylesheet.css") extensions = ['breathe', 'sphinx_rtd_theme'] breathe_projects = { 'xsimd': '../xml' } templates_path = ['_templates'] html_static_path = ['_static'] source_suffix = '.rst' master_doc = 'index' project = 'xsimd' copyright = '2016, Johan Mabille and Sylvain Corlay' author = 'Johan Mabille and Sylvain Corlay' html_logo = 'quantstack-white.svg' exclude_patterns = [] highlight_language = 'c++' pygments_style = 'sphinx' todo_include_todos = False htmlhelp_basename = 'xsimddoc' xsimd-13.2.0/docs/source/index.rst000066400000000000000000000075061475736624100170410ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. image:: xsimd.svg :alt: xsimd C++ wrappers for SIMD intrinsics. Introduction ------------ SIMD (Single Instruction, Multiple Data) is a feature of microprocessors that has been available for many years. SIMD instructions perform a single operation on a batch of values at once, and thus provide a way to significantly accelerate code execution. However, these instructions differ between microprocessor vendors and compilers. `xsimd` provides a unified means for using these features for library authors. Namely, it enables manipulation of batches of scalar and complex numbers with the same arithmetic operators and common mathematical functions as for single values. `xsimd` makes it easy to write a single algorithm, generate one version of the algorithm per micro-architecture and pick the best one at runtime, based on the running processor capability. You can find out more about this implementation of C++ wrappers for SIMD intrinsics at the `The C++ Scientist`_. The mathematical functions are a lightweight implementation of the algorithms also used in `boost.SIMD`_. `xsimd` requires a C++11 compliant compiler. 
The following C++ compilers are supported: +-------------------------+-------------------------------+ | Compiler | Version | +=========================+===============================+ | Microsoft Visual Studio | MSVC 2015 update 2 and above | +-------------------------+-------------------------------+ | g++ | 4.9 and above | +-------------------------+-------------------------------+ | clang | 3.7 and above | +-------------------------+-------------------------------+ The following SIMD instruction set extensions are supported: +--------------+---------------------------------------------------------+ | Architecture | Instruction set extensions | +==============+=========================================================+ | x86 | SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, FMA3, AVX2 | +--------------+---------------------------------------------------------+ | x86 | AVX512 (gcc7 and higher) | +--------------+---------------------------------------------------------+ | x86 AMD | same as above + FMA4 | +--------------+---------------------------------------------------------+ | ARM | ARMv7, ARMv8 | +--------------+---------------------------------------------------------+ | WebAssembly | WASM | +--------------+---------------------------------------------------------+ Licensing --------- We use a shared copyright model that enables all contributors to maintain the copyright on their contributions. This software is licensed under the BSD-3-Clause license. See the LICENSE file for details. .. toctree:: :caption: INSTALLATION :maxdepth: 2 installation .. toctree:: :caption: USAGE :maxdepth: 2 basic_usage vectorized_code .. toctree:: :caption: API REFERENCE :maxdepth: 1 api/instr_macros api/batch_index api/data_transfer api/arithmetic_index api/comparison_index api/bitwise_operators_index api/math_index api/reducer_index api/cast_index api/type_traits api/batch_manip api/misc_index api/aligned_allocator api/arch api/dispatching .. toctree:: :caption: MIGRATION GUIDE :maxdepth: 1 migration_guide .. _The C++ Scientist: http://johanmabille.github.io/blog/archives/ .. _boost.SIMD: https://github.com/NumScale/boost.simd xsimd-13.2.0/docs/source/installation.rst000066400000000000000000000034341475736624100204270ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html Installation ============ `xsimd` is a header-only library, so installing it is just a matter of copying the ``include/xsimd`` directory. However we provide standardized means to install it, with package managers or with cmake. Besides the `xsimd` headers, all these methods place the ``cmake`` project configuration file in the right location so that third-party projects can use cmake's ``find_package`` to locate `xsimd` headers. .. image:: conda.svg Using the conda-forge package ----------------------------- A package for `xsimd` is available for the `mamba `_ (or `conda `_) package manager. .. code:: mamba install -c conda-forge xsimd .. image:: spack.svg Using the Spack package ----------------------- A package for `xsimd` is available on the `Spack `_ package manager. .. code:: spack install xsimd spack load xsimd .. image:: cmake.svg From source with cmake ---------------------- You can install `xsimd` from source with `cmake `_. On Unix platforms, from the source directory: .. code:: mkdir build cd build cmake -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. 
make install On Windows platforms, from the source directory: .. code:: mkdir build cd build cmake -G "NMake Makefiles" -DCMAKE_INSTALL_PREFIX=/path/to/prefix .. nmake nmake install xsimd-13.2.0/docs/source/migration_guide.rst000066400000000000000000000056031475736624100210740ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software. .. raw:: html From 7.x to 8.x =============== Version 8.x introduces a number of API differences compared to version 7.x. This section motivates the version bump and details the most notable changes. Why 8.x ------- Version 8.x introduces a new concept in `xsimd`: all batch types are now parametrized by a type, say ``double``, and an optional architecture, say ``avx512``, as in ``batch<double, avx512>``. It is still possible to just require a batch of doubles and let the library pick the most appropriate architecture, as in ``batch<double>``. This new design makes it possible to target multiple architectures from the same code, as detailed in the :ref:`Arch Dispatching` section. As a side effect of this (almost full) rewrite of the library code, `xsimd` is now twice as fast to compile, and its source code size has been (roughly) divided by two. The `xsimd` developers also took this as an opportunity to significantly improve test coverage. Most Notable Changes -------------------- Batch Types *********** The second argument of :cpp:class:`xsimd::batch` is now a type that represents an architecture, instead of an integer. The previous behavior can be emulated through the :cpp:class:`xsimd::make_sized_batch` utility. Batch of Complex Types ********************** Loading a batch of complex values from an ``xtl::xcomplex`` now yields an ``xsimd::batch<std::complex<T>>`` instead of an ``xtl::xcomplex``. It is still possible to store an ``xsimd::batch<std::complex<T>>`` to an ``xtl::xcomplex``. Loading Batches *************** The ``xsimd::batch::load*`` functions are now static. It is no longer supported to update an existing batch through its ``load`` method. The regular assignment operator can be used instead. Indexing Batches **************** ``xsimd::batch::operator[](size_t)`` has been replaced with ``xsimd::batch::get(size_t)``. Keep in mind that this method implies a register load *for each call*, so it's wise not to use it in performance-critical sections. When needed, do an explicit store of the batch into an array and work from there. Architecture Detection ********************** Many macros have been replaced by more elaborate constructs. ``XSIMD_INSTR_SET_AVAILABLE`` has been replaced by the type alias ``xsimd::default_arch``. Likewise, architecture-specific macros like ``XSIMD_X86_INSTR_SET_AVAILABLE`` have been replaced by ``xsimd::supported_architectures::contains()``. Macros like ``XSIMD_WITH_SSE3`` are still defined to ``0`` or ``1`` to guard architecture-specific code. xsimd-13.2.0/docs/source/quantstack-white.svg000066400000000000000000000116361475736624100212140ustar00rootroot00000000000000 image/svg+xmlxsimd-13.2.0/docs/source/spack.svg000066400000000000000000000046711475736624100170200ustar00rootroot00000000000000 xsimd-13.2.0/docs/source/vectorized_code.rst000066400000000000000000000074451475736624100211030ustar00rootroot00000000000000.. Copyright (c) 2016, Johan Mabille and Sylvain Corlay Distributed under the terms of the BSD 3-Clause License. The full license is in the file LICENSE, distributed with this software.
Writing vectorized code ======================= Assume that we have a simple function that computes the mean of two vectors, something like: .. literalinclude:: ../../test/doc/writing_vectorized_code.cpp How can we use `xsimd` to take advantage of vectorization? Explicit use of an instruction set ---------------------------------- `xsimd` provides the template class :cpp:class:`xsimd::batch` parametrized by the types ``T`` and ``A``, where ``T`` is the type of the values involved in SIMD instructions and ``A`` is the target architecture. If you know which instruction set is available on your machine, you can directly use the corresponding specialization of ``batch``. For instance, assuming the AVX instruction set is available, the previous code can be vectorized the following way: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean.cpp However, if you want to write code that is portable, you cannot rely on the use of ``batch``. Indeed, this won't compile on a CPU where only the SSE2 instruction set is available, for instance. Fortunately, if you don't set the second template parameter, `xsimd` picks the best architecture among the ones available, based on the compiler flags you use. Aligned vs unaligned memory --------------------------- In the previous example, you may have noticed the :cpp:func:`xsimd::batch::load_unaligned` and :cpp:func:`xsimd::batch::store_unaligned` functions. These are meant for loading values from contiguous dynamically allocated memory into SIMD registers, and vice versa. When dealing with memory transfer operations, some instruction sets require the memory to be aligned on a given boundary, while others can handle both aligned and unaligned modes. In the latter case, operating on aligned memory is generally faster than operating on unaligned memory. `xsimd` provides an aligned memory allocator, namely :cpp:class:`xsimd::aligned_allocator`, which follows the standard requirements, so it can be used with STL containers. Let's change the previous code so it can take advantage of this allocator: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_aligned.cpp Memory alignment and tag dispatching ------------------------------------ You may need to write code that can operate on any type of vectors or arrays, not only the STL ones. In that case, you cannot make assumptions about the memory alignment of the container. `xsimd` provides a tag dispatching mechanism that allows you to easily write such generic code: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_tag_dispatch.cpp Here, the ``Tag`` template parameter can be :cpp:class:`xsimd::aligned_mode` or :cpp:class:`xsimd::unaligned_mode`. Assuming the existence of a ``get_alignment_tag`` meta-function in the code, the previous code can be invoked this way: .. code:: mean(a, b, res, get_alignment_tag()); Writing arch-independent code ----------------------------- If your code may target the SSE2, AVX2 or AVX512 instruction sets, `xsimd` makes it possible to write code that is even more generic, by using the architecture as a template parameter: .. literalinclude:: ../../test/doc/explicit_use_of_an_instruction_set_mean_arch_independent.cpp This can be useful to implement runtime dispatching, based on the instruction set detected at runtime. `xsimd` provides generic machinery, :cpp:func:`xsimd::dispatch()`, to implement this pattern. Based on the above example, instead of calling ``mean{}(arch, a, b, res, tag)``, one can use ``xsimd::dispatch(mean{})(a, b, res, tag)``, as sketched below.
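A minimal sketch of such an architecture-templated functor (illustrative only; the container alias ``vector_type`` and the scalar tail loop are our additions, not taken from the shipped examples):

.. code-block:: c++

    #include "xsimd/xsimd.hpp"
    #include <cstddef>
    #include <vector>

    // Illustrative container alias; any contiguous container works.
    using vector_type = std::vector<double, xsimd::aligned_allocator<double>>;

    struct mean
    {
        template <class Arch, class Tag>
        void operator()(Arch, const vector_type& a, const vector_type& b,
                        vector_type& res, Tag tag) const
        {
            using batch_type = xsimd::batch<double, Arch>;
            std::size_t size = a.size();
            std::size_t vec_size = size - size % batch_type::size;
            // Vectorized part, using the load/store flavor selected by Tag.
            for (std::size_t i = 0; i < vec_size; i += batch_type::size)
            {
                batch_type avec = batch_type::load(&a[i], tag);
                batch_type bvec = batch_type::load(&b[i], tag);
                ((avec + bvec) * 0.5).store(&res[i], tag);
            }
            // Scalar tail for the remaining elements.
            for (std::size_t i = vec_size; i < size; ++i)
            {
                res[i] = (a[i] + b[i]) * 0.5;
            }
        }
    };
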
More about this can be found in the :ref:`Arch Dispatching` section. xsimd-13.2.0/docs/source/xsimd.svg000066400000000000000000000055741475736624100170500ustar00rootroot00000000000000 xsimd-13.2.0/environment.yml000066400000000000000000000001201475736624100160200ustar00rootroot00000000000000name: xsimd channels: - conda-forge dependencies: - ninja - xtl - doctest xsimd-13.2.0/examples/000077500000000000000000000000001475736624100145565ustar00rootroot00000000000000xsimd-13.2.0/examples/CMakeLists.txt000066400000000000000000000037541475736624100173270ustar00rootroot00000000000000############################################################################ # Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and # # Martin Renou # # Copyright (c) QuantStack # # Copyright (c) Serge Guelton # # # # Distributed under the terms of the BSD 3-Clause License. # # # # The full license is in the file LICENSE, distributed with this software. # ############################################################################ cmake_minimum_required(VERSION 3.1) if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) project(xsimd-examples) find_package(xsimd REQUIRED CONFIG) set(XSIMD_INCLUDE_DIR ${xsimd_INCLUDE_DIR}) endif () include_directories(${XSIMD_INCLUDE_DIR}) if(NOT CMAKE_BUILD_TYPE) message(STATUS "Setting examples build type to Release") set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) else() message(STATUS "Tests build type is ${CMAKE_BUILD_TYPE}") endif() if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Intel") if (NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() if(NOT CMAKE_CXX_COMPILER_ID MATCHES Clang) # We are using clang-cl set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") endif() endif() add_executable(mandelbrot mandelbrot.cpp ${XSIMD_HEADERS}) set_property(TARGET mandelbrot PROPERTY CXX_STANDARD 14) if(ENABLE_XTL_COMPLEX) target_link_libraries(mandelbrot PRIVATE xtl) endif() add_custom_target(xmandelbrot COMMAND mandelbrot DEPENDS mandelbrot) xsimd-13.2.0/examples/mandelbrot.cpp000066400000000000000000000233061475736624100174150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/mandelbrot.cpp // Author Jefferson Amstutz / intel #include #include #include #include #include "pico_bench.hpp" #include // helper function to write the rendered image as PPM file inline void writePPM(const std::string& fileName, const int sizeX, const int sizeY, const int* pixel) { FILE* file = fopen(fileName.c_str(), "wb"); fprintf(file, "P6\n%i %i\n255\n", sizeX, sizeY); unsigned char* out = (unsigned char*)alloca(3 * sizeX); for (int y = 0; y < sizeY; y++) { const unsigned char* in = (const unsigned char*)&pixel[(sizeY - 1 - y) * sizeX]; for (int x = 0; x < sizeX; x++) { out[3 * x + 0] = in[4 * x + 0]; out[3 * x + 1] = in[4 * x + 1]; out[3 * x + 2] = in[4 * x + 2]; } fwrite(out, 3 * sizeX, sizeof(char), file); } fprintf(file, "\n"); fclose(file); } namespace xsimd { template inline batch mandel(const batch_bool& _active, const batch& c_re, const batch& c_im, int maxIters) { using float_batch_type = batch; using int_batch_type = batch; constexpr std::size_t N = float_batch_type::size; float_batch_type z_re = c_re; float_batch_type z_im = c_im; int_batch_type vi(0); for (int i = 0; i < maxIters; ++i) { auto active = _active & ((z_re * z_re + z_im * z_im) <= float_batch_type(4.f)); if (!xsimd::any(active)) { break; } float_batch_type new_re = z_re * z_re - z_im * z_im; float_batch_type new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; vi = select(batch_bool_cast(active), vi + 1, vi); } return vi; } template void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIters, int output[]) { using float_batch_type = batch; using int_batch_type = batch; constexpr std::size_t N = float_batch_type::size; float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; float arange[N]; std::iota(&arange[0], &arange[N], 0.f); // float_batch_type programIndex(&arange[0], xsimd::aligned_mode()); auto programIndex = float_batch_type::load(&arange[0], xsimd::aligned_mode()); // std::iota(programIndex.begin(), programIndex.end(), 0.f); for (int j = 0; j < height; j++) { for (int i = 0; i < width; i += N) { float_batch_type x(x0 + (i + programIndex) * dx); float_batch_type y(y0 + j * dy); auto active = x < float_batch_type(width); int base_index = (j * width + i); auto result = mandel(active, x, y, maxIters); // implement masked store! 
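// The masked store is emulated below: the previously stored values are
// reloaded, blended with the fresh results on the active lanes via select(),
// and the blended batch is stored back.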
// xsimd::store_aligned(result, output + base_index, active); int_batch_type prev_data = int_batch_type::load_unaligned(output + base_index); select(batch_bool_cast(active), result, prev_data) .store_aligned(output + base_index); } } } } // namespace xsimd // omp version //////////////////////////////////////////////////////////////// namespace omp { #pragma omp declare simd template inline int mandel(T c_re, T c_im, int count) { T z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } T new_re = z_re * z_re - z_im * z_im; T new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { #pragma omp simd for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace omp // scalar version ///////////////////////////////////////////////////////////// namespace scalar { inline int mandel(float c_re, float c_im, int count) { float z_re = c_re, z_im = c_im; int i; for (i = 0; i < count; ++i) { if (z_re * z_re + z_im * z_im > 4.f) { break; } float new_re = z_re * z_re - z_im * z_im; float new_im = 2.f * z_re * z_im; z_re = c_re + new_re; z_im = c_im + new_im; } return i; } void mandelbrot(float x0, float y0, float x1, float y1, int width, int height, int maxIterations, int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; for (int j = 0; j < height; j++) { for (int i = 0; i < width; ++i) { float x = x0 + i * dx; float y = y0 + j * dy; int index = (j * width + i); output[index] = mandel(x, y, maxIterations); } } } } // namespace scalar // run simd version of mandelbrot benchmark for a specific arch template void run_arch( bencher_t& bencher, float x0, float y0, float x1, float y1, int width, int height, int maxIters, std::vector>& buffer) { std::fill(buffer.begin(), buffer.end(), 0); auto stats = bencher([&]() { xsimd::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buffer.data()); }); const float scalar_min = stats.min().count(); std::cout << '\n' << arch::name() << " " << stats << '\n'; auto filename = std::string("mandelbrot_") + std::string(arch::name()) + std::string(".ppm"); writePPM(filename.c_str(), width, height, buffer.data()); } template struct run_archlist; // run simd version of mandelbrot benchmark for a list // of archs template struct run_archlist> { template static void run( bencher_t& bencher, float x0, float y0, float x1, float y1, int width, int height, int maxIters, std::vector>& buffer) { (void)std::initializer_list { (run_arch(bencher, x0, y0, x1, x1, width, height, maxIters, buffer), 0)... }; } }; int main() { using namespace std::chrono; const unsigned int width = 1024; const unsigned int height = 768; const float x0 = -2; const float x1 = 1; const float y0 = -1; const float y1 = 1; const int maxIters = 256; std::vector> buf(width * height); auto bencher = pico_bench::Benchmarker { 64, seconds { 10 } }; std::cout << "starting benchmarks (results in 'ms')... 
" << '\n'; // scalar run /////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_scalar = bencher([&]() { scalar::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float scalar_min = stats_scalar.min().count(); std::cout << '\n' << "scalar " << stats_scalar << '\n'; writePPM("mandelbrot_scalar.ppm", width, height, buf.data()); // omp run ////////////////////////////////////////////////////////////////// std::fill(buf.begin(), buf.end(), 0); auto stats_omp = bencher([&]() { omp::mandelbrot(x0, y0, x1, y1, width, height, maxIters, buf.data()); }); const float omp_min = stats_omp.min().count(); std::cout << '\n' << "omp " << stats_omp << '\n'; writePPM("mandelbrot_omp.ppm", width, height, buf.data()); run_archlist::run(bencher, x0, y0, x1, y1, width, height, maxIters, buf); return 0; } xsimd-13.2.0/examples/pico_bench.hpp000066400000000000000000000177161475736624100173740ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ // This file is derived from tsimd (MIT License) // https://github.com/ospray/tsimd/blob/master/benchmarks/pico_bench.h // Author Jefferson Amstutz / intel #ifndef PICO_BENCH_H #define PICO_BENCH_H #include #include #include #include #include #include #include #include #include #include namespace pico_bench { /* Statistics on some time measurement value T, e.g. 
T = * std::chrono::milliseconds T must be some std::chrono::duration type */ template class Statistics { using rep = typename T::rep; std::vector samples; public: std::string time_suffix; Statistics(std::vector s) : samples(s) { std::sort(std::begin(samples), std::end(samples)); } T percentile(const float p) const { return percentile(p, samples); } // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile void winsorize(const float limit) { winsorize(limit, samples); } T median() const { return percentile(50.0, samples); } T median_abs_dev() const { const auto m = median(); std::vector deviations; deviations.reserve(samples.size()); std::transform(std::begin(samples), std::end(samples), std::back_inserter(deviations), [&m](const T& t) { return T { std::abs((t - m).count()) }; }); std::sort(std::begin(deviations), std::end(deviations)); return percentile(50.0, deviations); } T mean() const { const auto m = std::accumulate(std::begin(samples), std::end(samples), T { 0 }); return m / samples.size(); } T std_dev() const { const auto m = mean(); auto val = std::accumulate( std::begin(samples), std::end(samples), T { 0 }, [&m](const T& p, const T& t) { return T { static_cast(p.count() + std::pow((t - m).count(), 2)) }; }); return T { static_cast(std::sqrt(1.0 / static_cast(samples.size()) * static_cast(val.count()))) }; } T min() const { return samples.front(); } T max() const { return samples.back(); } std::size_t size() const { return samples.size(); } const T& operator[](size_t i) const { return samples[i]; } private: // Winsorize the data, sets all entries above 100 - limit percentile and // below limit percentile to the value of that percentile static void winsorize(const float limit, std::vector& samples) { const auto low = percentile(limit, samples); const auto high = percentile(100.0 - limit, samples); for (auto& t : samples) { if (t < low) { t = low; } else if (t > high) { t = high; } } } static T percentile(const float p, const std::vector& samples) { assert(!samples.empty()); assert(p <= 100.0); assert(p >= 0.0); if (samples.size() == 1) { return samples.front(); } if (p == 100.0) { return samples.back(); } const double rank = p / 100.0 * (static_cast(samples.size()) - 1.0); const double low_r = std::floor(rank); const double dist = rank - low_r; const size_t k = static_cast(low_r); const auto low = samples[k]; const auto high = samples[k + 1]; return T { static_cast(low.count() + (high - low).count() * dist) }; } }; /* Benchmarking measurment using some desired unit of time measurement, * e.g. T = std::chrono::milliseconds. 
T must be some std::chrono::duration */ template class Benchmarker { const size_t MAX_ITER; const T MAX_RUNTIME; template struct BenchWrapper { Fn fn; BenchWrapper(Fn fn) : fn(fn) { } T operator()() { auto start = std::chrono::high_resolution_clock::now(); fn(); auto end = std::chrono::high_resolution_clock::now(); return std::chrono::duration_cast(end - start); } }; public: using stats_type = Statistics; // Benchmark the functions either max_iter times or until max_runtime // seconds have elapsed max_runtime should be > 0 Benchmarker(const size_t max_iter, const std::chrono::seconds max_runtime) : MAX_ITER(max_iter) , MAX_RUNTIME(std::chrono::duration_cast(max_runtime)) { } // Create a benchmarker that will run the function for the desired number of // iterations, regardless of how long it takes Benchmarker(const size_t max_iter) : MAX_ITER(max_iter) , MAX_RUNTIME(0) { } template typename std::enable_if()())>::value, stats_type>::type operator()(Fn fn) const { return (*this)(BenchWrapper { fn }); } template typename std::enable_if()()), T>::value, stats_type>::type operator()(Fn fn) const { // Do a single un-timed warm up run fn(); T elapsed { 0 }; std::vector samples; for (size_t i = 0; i < MAX_ITER && (MAX_RUNTIME.count() == 0 || elapsed < MAX_RUNTIME); ++i, elapsed += samples.back()) { samples.push_back(fn()); } return stats_type { samples }; } }; } // namespace pico_bench template std::ostream& operator<<(std::ostream& os, const pico_bench::Statistics& stats) { os << "Statistics:\n" << "\tmax: " << stats.max().count() << stats.time_suffix << "\n" << "\tmin: " << stats.min().count() << stats.time_suffix << "\n" << "\tmedian: " << stats.median().count() << stats.time_suffix << "\n" << "\tmedian abs dev: " << stats.median_abs_dev().count() << stats.time_suffix << "\n" << "\tmean: " << stats.mean().count() << stats.time_suffix << "\n" << "\tstd dev: " << stats.std_dev().count() << stats.time_suffix << "\n" << "\t# of samples: " << stats.size(); return os; } #endif xsimd-13.2.0/include/000077500000000000000000000000001475736624100143635ustar00rootroot00000000000000xsimd-13.2.0/include/xsimd/000077500000000000000000000000001475736624100155075ustar00rootroot00000000000000xsimd-13.2.0/include/xsimd/arch/000077500000000000000000000000001475736624100164245ustar00rootroot00000000000000xsimd-13.2.0/include/xsimd/arch/generic/000077500000000000000000000000001475736624100200405ustar00rootroot00000000000000xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp000066400000000000000000000235751475736624100256160ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_GENERIC_ARITHMETIC_HPP #define XSIMD_GENERIC_ARITHMETIC_HPP #include #include #include #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // bitwise_lshift template ::value, void>::type*/> XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x << y; }, self, other); } // bitwise_rshift template ::value, void>::type*/> XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x >> y; }, self, other); } // decr template XSIMD_INLINE batch decr(batch const& self, requires_arch) noexcept { return self - T(1); } // decr_if template XSIMD_INLINE batch decr_if(batch const& self, Mask const& mask, requires_arch) noexcept { return select(mask, decr(self), self); } // div template ::value, void>::type> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x / y; }, self, other); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return x * y + z; } template XSIMD_INLINE batch, A> fma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return x * y - z; } template XSIMD_INLINE batch, A> fms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -x * y + z; } template XSIMD_INLINE batch, A> fnma(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real())); auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return -x * y - z; } template XSIMD_INLINE batch, A> fnms(batch, A> const& x, batch, A> const& y, batch, A> const& z, requires_arch) noexcept { auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real())); auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag())); return { res_r, res_i }; } // hadd template ::value, void>::type*/> XSIMD_INLINE T hadd(batch const& self, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(buffer); T res = 0; for (T val : buffer) { res += val; } return res; } // incr template XSIMD_INLINE batch incr(batch const& self, requires_arch) noexcept { return self + T(1); } // incr_if template XSIMD_INLINE batch incr_if(batch const& self, Mask const& mask, requires_arch) noexcept { return select(mask, incr(self), self); } // mul template ::value, void>::type*/> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) 
noexcept { return detail::apply([](T x, T y) noexcept -> T { return x * y; }, self, other); } // rotl template XSIMD_INLINE batch rotl(batch const& self, STy other, requires_arch) noexcept { constexpr auto N = std::numeric_limits::digits; return (self << other) | (self >> (N - other)); } // rotr template XSIMD_INLINE batch rotr(batch const& self, STy other, requires_arch) noexcept { constexpr auto N = std::numeric_limits::digits; return (self >> other) | (self << (N - other)); } // sadd template XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return add(self, other); // no saturated arithmetic on floating point numbers } template ::value, void>::type*/> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = (other >> (8 * sizeof(T) - 1)); auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } template XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return add(self, other); // no saturated arithmetic on floating point numbers } // ssub template XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return sub(self, other); // no saturated arithmetic on floating point numbers } template ::value, void>::type*/> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } template XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { return sub(self, other); // no saturated arithmetic on floating point numbers } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_complex.hpp000066400000000000000000000076651475736624100251360ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_GENERIC_COMPLEX_HPP #define XSIMD_GENERIC_COMPLEX_HPP #include #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // real template XSIMD_INLINE batch real(batch const& self, requires_arch) noexcept { return self; } template XSIMD_INLINE batch real(batch, A> const& self, requires_arch) noexcept { return self.real(); } // imag template XSIMD_INLINE batch imag(batch const& /*self*/, requires_arch) noexcept { return batch(T(0)); } template XSIMD_INLINE batch imag(batch, A> const& self, requires_arch) noexcept { return self.imag(); } // arg template XSIMD_INLINE real_batch_type_t> arg(batch const& self, requires_arch) noexcept { return atan2(imag(self), real(self)); } // conj template XSIMD_INLINE complex_batch_type_t> conj(batch const& self, requires_arch) noexcept { return { real(self), -imag(self) }; } // norm template XSIMD_INLINE real_batch_type_t> norm(batch const& self, requires_arch) noexcept { return { fma(real(self), real(self), imag(self) * imag(self)) }; } // proj template XSIMD_INLINE complex_batch_type_t> proj(batch const& self, requires_arch) noexcept { using batch_type = complex_batch_type_t>; using real_batch = typename batch_type::real_batch; using real_value_type = typename real_batch::value_type; auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self)); return select(cond, batch_type(constants::infinity(), copysign(real_batch(real_value_type(0)), imag(self))), batch_type(self)); } template XSIMD_INLINE batch_bool isnan(batch, A> const& self, requires_arch) noexcept { return batch_bool(isnan(self.real()) || isnan(self.imag())); } template XSIMD_INLINE batch_bool isinf(batch, A> const& self, requires_arch) noexcept { return batch_bool(isinf(self.real()) || isinf(self.imag())); } template XSIMD_INLINE batch_bool isfinite(batch, A> const& self, requires_arch) noexcept { return batch_bool(isfinite(self.real()) && isfinite(self.imag())); } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_details.hpp000066400000000000000000000357751475736624100251170ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_DETAILS_HPP #define XSIMD_GENERIC_DETAILS_HPP #include #include "../../math/xsimd_rem_pio2.hpp" #include "../../types/xsimd_generic_arch.hpp" #include "../../types/xsimd_utils.hpp" #include "../xsimd_constants.hpp" namespace xsimd { // Forward declaration. Should we put them in a separate file? 
template XSIMD_INLINE batch abs(batch const& self) noexcept; template XSIMD_INLINE batch abs(batch, A> const& self) noexcept; template XSIMD_INLINE bool any(batch_bool const& self) noexcept; template XSIMD_INLINE batch atan2(batch const& self, batch const& other) noexcept; template XSIMD_INLINE batch batch_cast(batch const&, batch const& out) noexcept; template XSIMD_INLINE batch bitofsign(batch const& self) noexcept; template XSIMD_INLINE batch bitwise_cast(batch const& self) noexcept; template XSIMD_INLINE batch cos(batch const& self) noexcept; template XSIMD_INLINE batch cosh(batch const& self) noexcept; template XSIMD_INLINE batch exp(batch const& self) noexcept; template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z) noexcept; template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z) noexcept; template XSIMD_INLINE batch frexp(const batch& x, const batch, A>& e) noexcept; template XSIMD_INLINE batch horner(const batch& self) noexcept; template XSIMD_INLINE batch hypot(const batch& self) noexcept; template XSIMD_INLINE batch_bool is_even(batch const& self) noexcept; template XSIMD_INLINE batch_bool is_flint(batch const& self) noexcept; template XSIMD_INLINE batch_bool is_odd(batch const& self) noexcept; template XSIMD_INLINE typename batch::batch_bool_type isinf(batch const& self) noexcept; template XSIMD_INLINE typename batch::batch_bool_type isfinite(batch const& self) noexcept; template XSIMD_INLINE typename batch::batch_bool_type isnan(batch const& self) noexcept; template XSIMD_INLINE batch ldexp(const batch& x, const batch, A>& e) noexcept; template XSIMD_INLINE batch log(batch const& self) noexcept; template XSIMD_INLINE batch nearbyint(batch const& self) noexcept; template XSIMD_INLINE batch, A> nearbyint_as_int(const batch& x) noexcept; template XSIMD_INLINE T reduce_add(batch const&) noexcept; template XSIMD_INLINE batch select(batch_bool const&, batch const&, batch const&) noexcept; template XSIMD_INLINE batch, A> select(batch_bool const&, batch, A> const&, batch, A> const&) noexcept; template XSIMD_INLINE batch sign(batch const& self) noexcept; template XSIMD_INLINE batch signnz(batch const& self) noexcept; template XSIMD_INLINE batch sin(batch const& self) noexcept; template XSIMD_INLINE batch sinh(batch const& self) noexcept; template XSIMD_INLINE std::pair, batch> sincos(batch const& self) noexcept; template XSIMD_INLINE batch sqrt(batch const& self) noexcept; template XSIMD_INLINE batch tan(batch const& self) noexcept; template XSIMD_INLINE batch, A> to_float(batch const& self) noexcept; template XSIMD_INLINE batch, A> to_int(batch const& self) noexcept; template XSIMD_INLINE batch trunc(batch const& self) noexcept; namespace kernel { namespace detail { template XSIMD_INLINE batch apply(F&& func, batch const& self, batch const& other) noexcept { constexpr std::size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; alignas(A::alignment()) T other_buffer[size]; self.store_aligned(&self_buffer[0]); other.store_aligned(&other_buffer[0]); for (std::size_t i = 0; i < size; ++i) { self_buffer[i] = func(self_buffer[i], other_buffer[i]); } return batch::load_aligned(self_buffer); } template XSIMD_INLINE batch apply_transform(F&& func, batch const& self) noexcept { static_assert(batch::size == batch::size, "Source and destination sizes must match"); constexpr std::size_t src_size = batch::size; constexpr std::size_t dest_size = batch::size; alignas(A::alignment()) T self_buffer[src_size]; alignas(A::alignment()) U 
other_buffer[dest_size]; self.store_aligned(&self_buffer[0]); for (std::size_t i = 0; i < src_size; ++i) { other_buffer[i] = func(self_buffer[i]); } return batch::load_aligned(other_buffer); } } // some generic fast_cast conversion namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return bitwise_cast(self); } // Provide a generic uint32_t -> float cast only if we have a // non-generic int32_t -> float fast_cast template const&>(), std::declval const&>(), A {}))> XSIMD_INLINE batch fast_cast(batch const& v, batch const&, requires_arch) noexcept { // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse batch msk_lo(0xFFFF); batch cnst65536f(65536.0f); auto v_lo = batch_cast(v & msk_lo); /* extract the 16 lowest significant bits of self */ auto v_hi = batch_cast(v >> 16); /* 16 most significant bits of v */ auto v_lo_flt = batch_cast(v_lo); /* No rounding */ auto v_hi_flt = batch_cast(v_hi); /* No rounding */ v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */ return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */ } // Provide a generic float -> uint32_t cast only if we have a // non-generic float -> int32_t fast_cast template const&>(), std::declval const&>(), A {}))> XSIMD_INLINE batch fast_cast(batch const& v, batch const&, requires_arch) noexcept { auto is_large = v >= batch(1u << 31); auto small_v = bitwise_cast(batch_cast(v)); auto large_v = bitwise_cast( batch_cast(v - batch(1u << 31)) ^ batch(1u << 31)); return bitwise_cast(select(is_large, large_v, small_v)); } } namespace detail { // Generic conversion handling machinery. Each architecture must define // conversion function when such conversions exits in the form of // intrinsic. Then we use that information to automatically decide whether // to use scalar or vector conversion when doing load / store / batch_cast struct with_fast_conversion { }; struct with_slow_conversion { }; template struct conversion_type_impl { using type = with_slow_conversion; }; using xsimd::detail::void_t; template struct conversion_type_impl&>(), std::declval&>(), std::declval()))>> { using type = with_fast_conversion; }; template using conversion_type = typename conversion_type_impl::type; } namespace detail { /* origin: boost/simdfunction/horn.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE B coef() noexcept { using value_type = typename B::value_type; return B(bit_cast(as_unsigned_integer_t(c))); } template XSIMD_INLINE B horner(const B&) noexcept { return B(typename B::value_type(0.)); } template XSIMD_INLINE B horner(const B&) noexcept { return coef(); } template XSIMD_INLINE B horner(const B& self) noexcept { return fma(self, horner(self), coef()); } /* origin: boost/simdfunction/horn1.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE B horner1(const B&) noexcept { return B(1.); } template XSIMD_INLINE B horner1(const B& x) noexcept { return x + detail::coef(); } template XSIMD_INLINE B horner1(const B& x) noexcept { return fma(x, horner1(x), detail::coef()); } } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_logical.hpp000066400000000000000000000202351475736624100250650ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_LOGICAL_HPP #define XSIMD_GENERIC_LOGICAL_HPP #include "./xsimd_generic_details.hpp" #include namespace xsimd { namespace kernel { using namespace types; // count template XSIMD_INLINE size_t count(batch_bool const& self, requires_arch) noexcept { uint64_t m = self.mask(); XSIMD_IF_CONSTEXPR(batch_bool::size < 14) { // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 return (m * 0x200040008001ULL & 0x111111111111111ULL) % 0xf; } else { #if defined __has_builtin #if __has_builtin(__builtin_popcountg) #define builtin_popcount(v) __builtin_popcountg(v) #endif #endif #ifdef builtin_popcount return builtin_popcount(m); #else // FIXME: we could do better by dispatching to the appropriate // popcount instruction depending on the arch... XSIMD_IF_CONSTEXPR(batch_bool::size <= 32) { uint32_t m32 = m; // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel m32 = m32 - ((m32 >> 1) & 0x55555555); // reuse input as temporary m32 = (m32 & 0x33333333) + ((m32 >> 2) & 0x33333333); // temp return (((m32 + (m32 >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; // count } else { // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count } #endif } } // from mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; // This is inefficient but should never be called. It's just a // temporary implementation until arm support is added. 
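// Illustrative scalar version of the 32-bit SWAR population count that
// count() above falls back to when no popcount builtin is available (see the
// Stanford bit-twiddling page referenced in the kernel).
#include <cstdint>

inline std::uint32_t popcount32_sketch(std::uint32_t m)
{
    m = m - ((m >> 1) & 0x55555555u);                            // 2-bit partial sums
    m = (m & 0x33333333u) + ((m >> 2) & 0x33333333u);            // 4-bit partial sums
    return (((m + (m >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24; // fold byte sums into the top byte
}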
for (size_t i = 0; i < batch_bool::size; ++i) buffer[i] = mask & (1ull << i); return batch_bool::load_aligned(buffer); } // ge template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return other <= self; } // gt template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return other < self; } // is_even template XSIMD_INLINE batch_bool is_even(batch const& self, requires_arch) noexcept { return is_flint(self * T(0.5)); } // is_flint template XSIMD_INLINE batch_bool is_flint(batch const& self, requires_arch) noexcept { auto frac = select(isnan(self - self), constants::nan>(), self - trunc(self)); return frac == T(0.); } // is_odd template XSIMD_INLINE batch_bool is_odd(batch const& self, requires_arch) noexcept { return is_even(self - T(1.)); } // isinf template ::value, void>::type> XSIMD_INLINE batch_bool isinf(batch const&, requires_arch) noexcept { return batch_bool(false); } template XSIMD_INLINE batch_bool isinf(batch const& self, requires_arch) noexcept { return abs(self) == std::numeric_limits::infinity(); } template XSIMD_INLINE batch_bool isinf(batch const& self, requires_arch) noexcept { return abs(self) == std::numeric_limits::infinity(); } // isfinite template ::value, void>::type> XSIMD_INLINE batch_bool isfinite(batch const&, requires_arch) noexcept { return batch_bool(true); } template XSIMD_INLINE batch_bool isfinite(batch const& self, requires_arch) noexcept { return (self - self) == 0.f; } template XSIMD_INLINE batch_bool isfinite(batch const& self, requires_arch) noexcept { return (self - self) == 0.; } // isnan template ::value, void>::type> XSIMD_INLINE batch_bool isnan(batch const&, requires_arch) noexcept { return batch_bool(false); } // le template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return (self < other) || (self == other); } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return !(other == self); } // logical_and template XSIMD_INLINE batch logical_and(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x && y; }, self, other); } // logical_or template XSIMD_INLINE batch logical_or(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept { return x || y; }, self, other); } // mask template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; self.store_aligned(buffer); // This is inefficient but should never be called. It's just a // temporary implementation until arm support is added. uint64_t res = 0; for (size_t i = 0; i < batch_bool::size; ++i) if (buffer[i]) res |= 1ul << i; return res; } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_math.hpp000066400000000000000000003404601475736624100244110ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_GENERIC_MATH_HPP #define XSIMD_GENERIC_MATH_HPP #include "../xsimd_scalar.hpp" #include "./xsimd_generic_details.hpp" #include "./xsimd_generic_trigo.hpp" #include namespace xsimd { namespace kernel { using namespace types; // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) return self; else { auto sign = bitofsign(self); auto inv = self ^ sign; return inv - sign; } } template XSIMD_INLINE batch abs(batch, A> const& z, requires_arch) noexcept { return hypot(z.real(), z.imag()); } // avg namespace detail { template XSIMD_INLINE batch avg(batch const& x, batch const& y, std::true_type, std::false_type) noexcept { return (x & y) + ((x ^ y) >> 1); } template XSIMD_INLINE batch avg(batch const& x, batch const& y, std::true_type, std::true_type) noexcept { // Inspired by // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c auto t = (x & y) + ((x ^ y) >> 1); auto t_u = bitwise_cast::type>(t); auto avg = t + (bitwise_cast(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); return avg; } template XSIMD_INLINE batch avg(batch const& x, batch const& y, std::false_type, std::true_type) noexcept { return (x + y) / 2; } } template XSIMD_INLINE batch avg(batch const& x, batch const& y, requires_arch) noexcept { return detail::avg(x, y, typename std::is_integral::type {}, typename std::is_signed::type {}); } // avgr namespace detail { template XSIMD_INLINE batch avgr(batch const& x, batch const& y, std::true_type) noexcept { constexpr unsigned shift = 8 * sizeof(T) - 1; auto adj = std::is_signed::value ? ((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift); return ::xsimd::kernel::avg(x, y, A {}) + adj; } template XSIMD_INLINE batch avgr(batch const& x, batch const& y, std::false_type) noexcept { return ::xsimd::kernel::avg(x, y, A {}); } } template XSIMD_INLINE batch avgr(batch const& x, batch const& y, requires_arch) noexcept { return detail::avgr(x, y, typename std::is_integral::type {}); } // batch_cast template XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch) noexcept { return self; } namespace detail { template XSIMD_INLINE batch batch_cast(batch const& self, batch const& out, requires_arch, with_fast_conversion) noexcept { return fast_cast(self, out, A {}); } template XSIMD_INLINE batch batch_cast(batch const& self, batch const&, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be no conversion for this type combination"); using batch_type_in = batch; using batch_type_out = batch; static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes"); alignas(A::alignment()) T_in buffer_in[batch_type_in::size]; alignas(A::alignment()) T_out buffer_out[batch_type_out::size]; self.store_aligned(&buffer_in[0]); std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out)); return batch_type_out::load_aligned(buffer_out); } } template XSIMD_INLINE batch batch_cast(batch const& self, batch const& out, requires_arch) noexcept { return detail::batch_cast(self, out, A {}, detail::conversion_type {}); } // bitofsign template XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept { static_assert(std::is_integral::value, "int type implementation"); if (std::is_unsigned::value) return batch(0); else return self >> (T)(8 * sizeof(T) - 1); } template XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept 
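// Illustrative scalar version of the branchless signed abs used by the
// integer abs() above: an arithmetic right shift replicates the sign bit
// (bitofsign), and (x ^ s) - s performs a conditional two's-complement
// negation without a branch. As with std::abs, the most negative value has
// no positive counterpart.
#include <cstdint>

inline std::int32_t abs32_sketch(std::int32_t x)
{
    std::int32_t s = x >> 31; // 0 for non-negative x, -1 (all bits set) for negative x
    return (x ^ s) - s;       // flips the bits and adds one only when x is negative
}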
{ return self & constants::signmask>(); } template XSIMD_INLINE batch bitofsign(batch const& self, requires_arch) noexcept { return self & constants::signmask>(); } // bitwise_cast template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return self; } // cbrt /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch cbrt(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS auto denormal = z < constants::smallestposval(); z = select(denormal, z * constants::twotonmb(), z); batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif const batch_type CBRT2(bit_cast(0x3fa14518)); const batch_type CBRT4(bit_cast(0x3fcb2ff5)); const batch_type CBRT2I(bit_cast(0x3f4b2ff5)); const batch_type CBRT4I(bit_cast(0x3f214518)); using i_type = as_integer_t; i_type e; batch_type x = frexp(z, e); x = detail::horner(x); auto flag = e >= i_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * batch_type(1.f / 3.f); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(self)) * f; #else x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES return select(self == batch_type(0.) || isinf(self), self, x); #else return select(self == batch_type(0.), self, x); #endif } template XSIMD_INLINE batch cbrt(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type z = abs(self); #ifndef XSIMD_NO_DENORMALS auto denormal = z < constants::smallestposval(); z = select(denormal, z * constants::twotonmb(), z); batch_type f = select(denormal, constants::twotonmbo3(), batch_type(1.)); #endif const batch_type CBRT2(bit_cast(int64_t(0x3ff428a2f98d728b))); const batch_type CBRT4(bit_cast(int64_t(0x3ff965fea53d6e3d))); const batch_type CBRT2I(bit_cast(int64_t(0x3fe965fea53d6e3d))); const batch_type CBRT4I(bit_cast(int64_t(0x3fe428a2f98d728b))); using i_type = as_integer_t; i_type e; batch_type x = frexp(z, e); x = detail::horner(x); auto flag = e >= typename i_type::value_type(0); i_type e1 = abs(e); i_type rem = e1; e1 /= i_type(3); rem -= e1 * i_type(3); e = e1 * sign(e); const batch_type cbrt2 = select(batch_bool_cast(flag), CBRT2, CBRT2I); const batch_type cbrt4 = select(batch_bool_cast(flag), CBRT4, CBRT4I); batch_type fact = select(batch_bool_cast(rem == i_type(1)), cbrt2, batch_type(1.)); fact = select(batch_bool_cast(rem == i_type(2)), cbrt4, fact); x = ldexp(x * fact, e); x -= (x - z / (x * x)) * batch_type(1. / 3.); x -= (x - z / (x * x)) * batch_type(1. / 3.); #ifndef XSIMD_NO_DENORMALS x = (x | bitofsign(self)) * f; #else x = x | bitofsign(self); #endif #ifndef XSIMD_NO_INFINITIES return select(self == batch_type(0.) 
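// Illustrative scalar sketch of the cbrt() scheme above: take a rough initial
// guess for the cube root, then refine it with Newton steps of the same form
// as in the kernel, x -= (x - z / x^2) / 3. The kernel builds its guess from
// frexp plus a small polynomial instead of exp/log, and also handles
// denormals and infinities.
#include <cmath>

inline double cbrt_sketch(double z)
{
    if (z == 0.0 || std::isnan(z) || std::isinf(z))
        return z;
    double az = std::fabs(z);
    double x = std::exp(std::log(az) / 3.0);   // rough guess for az^(1/3)
    for (int i = 0; i < 4; ++i)
        x -= (x - az / (x * x)) * (1.0 / 3.0); // Newton iteration on f(x) = x^3 - az
    return std::copysign(x, z);
}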
|| isinf(self), self, x); #else return select(self == batch_type(0.), self, x); #endif } // clip template XSIMD_INLINE batch clip(batch const& self, batch const& lo, batch const& hi, requires_arch) noexcept { return min(hi, max(self, lo)); } // copysign template ::value, void>::type> XSIMD_INLINE batch copysign(batch const& self, batch const& other, requires_arch) noexcept { return abs(self) | bitofsign(other); } // erf namespace detail { /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct erf_kernel; template struct erf_kernel> { using batch_type = batch; // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 2/3 static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept { return detail::horner(x); } // computes erfc(x)*exp(sqr(x)) // x >= 2/3 static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept { return detail::horner(x); } static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept { return (batch_type(1.) - x) * detail::horner(x); } }; template struct erf_kernel> { using batch_type = batch; // computes erf(a0)/a0 // x is sqr(a0) and 0 <= abs(a0) <= 0.65 static XSIMD_INLINE batch_type erf1(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(x)*exp(x*x) // 0.65 <= abs(x) <= 2.2 static XSIMD_INLINE batch_type erfc2(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(x)*exp(x*x) // 2.2 <= abs(x) <= 6 static XSIMD_INLINE batch_type erfc3(const batch_type& x) noexcept { return detail::horner(x) / detail::horner(x); } // computes erfc(rx)*exp(rx*rx) // x >= 6 rx = 1/x static XSIMD_INLINE batch_type erfc4(const batch_type& x) noexcept { return detail::horner(x); } }; } /* origin: boost/simd/arch/common/simd/function/erf.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type r1(0.); auto test1 = x < batch_type(2.f / 3.f); if (any(test1)) { r1 = self * detail::erf_kernel::erf1(x * x); if (all(test1)) return r1; } batch_type z = x / (batch_type(1.) + x); z -= batch_type(0.4f); batch_type r2 = batch_type(1.) - exp(-x * x) * detail::erf_kernel::erfc2(z); r2 = select(self < batch_type(0.), -r2, r2); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(xsimd::isinf(self), sign(self), r1); #endif return r1; } template XSIMD_INLINE batch erf(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type xx = x * x; batch_type lim1(0.65); batch_type lim2(2.2); auto test1 = x < lim1; batch_type r1(0.); if (any(test1)) { r1 = self * detail::erf_kernel::erf1(xx); if (all(test1)) return r1; } auto test2 = x < lim2; auto test3 = test2 && !test1; batch_type ex = exp(-xx); if (any(test3)) { batch_type z = batch_type(1.) - ex * detail::erf_kernel::erfc2(x); batch_type r2 = select(self < batch_type(0.), -z, z); r1 = select(test1, r1, r2); if (all(test1 || test3)) return r1; } batch_type z = batch_type(1.) 
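// Illustrative sketch of the any()/all()/select() branch-pruning pattern used
// by erf and erfc above: a branch is only evaluated when some lane needs it,
// the function returns early when every lane is settled, and otherwise the
// two results are blended lane-wise with select(). sqrt_or_zero is a
// hypothetical example built on the public xsimd API, not an xsimd kernel.
#include "xsimd/xsimd.hpp"

inline xsimd::batch<float> sqrt_or_zero(xsimd::batch<float> const& x)
{
    using batch_type = xsimd::batch<float>;
    auto negative = x < batch_type(0.f);  // per-lane condition
    if (!xsimd::any(negative))
        return xsimd::sqrt(x);            // every lane is non-negative: early exit
    batch_type safe = xsimd::select(negative, batch_type(0.f), x);
    return xsimd::select(negative, batch_type(0.f), xsimd::sqrt(safe));
}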
- ex * detail::erf_kernel::erfc3(x); z = select(self < batch_type(0.), -z, z); #ifndef XSIMD_NO_INFINITIES z = select(xsimd::isinf(self), sign(self), z); #endif return select(test2, r1, z); } // erfc template XSIMD_INLINE batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test0 = self < batch_type(0.); batch_type r1(0.); auto test1 = 3.f * x < 2.f; batch_type z = x / (batch_type(1.) + x); if (any(test1)) { r1 = detail::erf_kernel::erfc3(z); if (all(test1)) return select(test0, batch_type(2.) - r1, r1); } z -= batch_type(0.4f); batch_type r2 = exp(-x * x) * detail::erf_kernel::erfc2(z); r1 = select(test1, r1, r2); #ifndef XSIMD_NO_INFINITIES r1 = select(x == constants::infinity(), batch_type(0.), r1); #endif return select(test0, batch_type(2.) - r1, r1); } template XSIMD_INLINE batch erfc(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type xx = x * x; batch_type lim1(0.65); batch_type lim2(2.2); auto test0 = self < batch_type(0.); auto test1 = x < lim1; batch_type r1(0.); if (any(test1)) { r1 = batch_type(1.) - x * detail::erf_kernel::erf1(xx); if (all(test1)) return select(test0, batch_type(2.) - r1, r1); } auto test2 = x < lim2; auto test3 = test2 && !test1; batch_type ex = exp(-xx); if (any(test3)) { batch_type z = ex * detail::erf_kernel::erfc2(x); r1 = select(test1, r1, z); if (all(test1 || test3)) return select(test0, batch_type(2.) - r1, r1); } batch_type z = ex * detail::erf_kernel::erfc3(x); r1 = select(test2, r1, z); #ifndef XSIMD_NO_INFINITIES r1 = select(x == constants::infinity(), batch_type(0.), r1); #endif return select(test0, batch_type(2.) - r1, r1); } // estrin namespace detail { template struct estrin { B x; template XSIMD_INLINE B operator()(const Ts&... coefs) noexcept { return eval(coefs...); } private: XSIMD_INLINE B eval(const B& c0) noexcept { return c0; } XSIMD_INLINE B eval(const B& c0, const B& c1) noexcept { return fma(x, c1, c0); } template XSIMD_INLINE B eval(::xsimd::detail::index_sequence, const Tuple& tuple) { return estrin { x * x }(std::get(tuple)...); } template XSIMD_INLINE B eval(const std::tuple& tuple) noexcept { return eval(::xsimd::detail::make_index_sequence(), tuple); } template XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0)))); } template XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0, const B& c1) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1)))); } template XSIMD_INLINE B eval(const std::tuple& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept { return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...); } template XSIMD_INLINE B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept { return eval(std::make_tuple(eval(c0, c1)), coefs...); } }; } template XSIMD_INLINE batch estrin(const batch& self) noexcept { using batch_type = batch; return detail::estrin { self }(detail::coef()...); } // exp /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { enum exp_reduction_tag { exp_tag, exp2_tag, exp10_tag }; template struct exp_reduction_base; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog(); } static constexpr B minlog() noexcept { return constants::minlog(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog10(); } static constexpr B minlog() noexcept { return constants::minlog10(); } }; template struct exp_reduction_base { static constexpr B maxlog() noexcept { return constants::maxlog2(); } static constexpr B minlog() noexcept { return constants::minlog2(); } }; template struct exp_reduction; template struct exp_reduction : exp_reduction_base, exp_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type y = detail::horner(x); return ++fma(y, x * x, x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); x = fnma(k, constants::log_2hi(), a); x = fnma(k, constants::log_2lo(), x); return k; } }; template struct exp_reduction : exp_reduction_base, exp10_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { return ++(detail::horner(x) * x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); x = fnma(k, constants::log10_2hi(), a); x -= k * constants::log10_2lo(); return k; } }; template struct exp_reduction : exp_reduction_base, exp2_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type y = detail::horner(x); return ++fma(y, x * x, x * constants::log_2()); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& x) noexcept { batch_type k = nearbyint(a); x = (a - k); return k; } }; template struct exp_reduction : exp_reduction_base, exp_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type t = x * x; return fnma(t, detail::horner(t), x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog_2() * a); hi = fnma(k, constants::log_2hi(), a); lo = k * constants::log_2lo(); x = hi - lo; return k; } static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept { return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) 
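// Illustrative scalar sketch of the exp() reduction above: write x = k*ln2 + r
// with |r| <= ln(2)/2, evaluate a short polynomial for e^r, and reconstruct
// e^x = 2^k * e^r with ldexp. The kernel splits ln2 into a hi/lo pair
// (log_2hi / log_2lo) so the subtraction stays exact, uses a tuned minimax
// polynomial, and clamps against minlog/maxlog; all of that is omitted here.
#include <cmath>

inline double exp_sketch(double x)
{
    double k = std::nearbyint(x * 1.4426950408889634); // round(x / ln 2)
    double r = x - k * 0.6931471805599453;             // reduced argument
    // Truncated Taylor series for e^r; accurate to a few units in the sixth decimal.
    double p = 1.0 + r * (1.0 + r * (0.5 + r * (1.0 / 6.0 + r * (1.0 / 24.0 + r / 120.0))));
    return std::ldexp(p, static_cast<int>(k));
}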
- c)) - hi)); } }; template struct exp_reduction : exp_reduction_base, exp10_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type xx = x * x; batch_type px = x * detail::horner(xx); batch_type x2 = px / (detail::horner1(xx) - px); return ++(x2 + x2); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(constants::invlog10_2() * a); x = fnma(k, constants::log10_2hi(), a); x = fnma(k, constants::log10_2lo(), x); return k; } static XSIMD_INLINE batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept { return c; } }; template struct exp_reduction : exp_reduction_base, exp2_tag> { using batch_type = batch; static XSIMD_INLINE batch_type approx(const batch_type& x) noexcept { batch_type t = x * x; return fnma(t, detail::horner(t), x); } static XSIMD_INLINE batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept { batch_type k = nearbyint(a); x = (a - k) * constants::log_2(); return k; } static XSIMD_INLINE batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept { return batch_type(1.) + x + x * c / (batch_type(2.) - c); } }; template XSIMD_INLINE batch exp(batch const& self) noexcept { using batch_type = batch; using reducer_t = exp_reduction; batch_type x; batch_type k = reducer_t::reduce(self, x); x = reducer_t::approx(x); x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k))); x = select(self >= reducer_t::maxlog(), constants::infinity(), x); return x; } template XSIMD_INLINE batch exp(batch const& self) noexcept { using batch_type = batch; using reducer_t = exp_reduction; batch_type hi, lo, x; batch_type k = reducer_t::reduce(self, hi, lo, x); batch_type c = reducer_t::approx(x); c = reducer_t::finalize(x, c, hi, lo); c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k))); c = select(self >= reducer_t::maxlog(), constants::infinity(), c); return c; } } template XSIMD_INLINE batch exp(batch const& self, requires_arch) noexcept { return detail::exp(self); } template XSIMD_INLINE batch, A> exp(batch, A> const& self, requires_arch) noexcept { using batch_type = batch, A>; auto isincos = sincos(self.imag()); return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos)); } // exp10 template XSIMD_INLINE batch exp10(batch const& self, requires_arch) noexcept { return detail::exp(self); } // exp2 template XSIMD_INLINE batch exp2(batch const& self, requires_arch) noexcept { return detail::exp(self); } // expm1 namespace detail { /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static XSIMD_INLINE batch expm1(const batch& a) noexcept { using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type x = fnma(k, constants::log_2hi(), a); x = fnma(k, constants::log_2lo(), x); batch_type hx = x * batch_type(0.5); batch_type hxs = x * hx; batch_type r = detail::horner(hxs); batch_type t = fnma(r, hx, batch_type(3.)); batch_type e = hxs * ((r - t) / (batch_type(6.) 
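// Illustrative scalar version of the complex exp() above:
// exp(a + i*b) = exp(a) * (cos(b) + i*sin(b)); the SIMD kernel obtains the
// cosine and sine together through sincos. Non-finite inputs are not handled.
#include <cmath>
#include <complex>

inline std::complex<double> cexp_sketch(std::complex<double> z)
{
    double r = std::exp(z.real());
    return { r * std::cos(z.imag()), r * std::sin(z.imag()) };
}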
- x * t)); e = fms(x, e, hxs); using i_type = as_integer_t; i_type ik = to_int(k); batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); batch_type y = batch_type(1.) - two2mk - (e - x); return ldexp(y, ik); } template static XSIMD_INLINE batch expm1(const batch& a) noexcept { using batch_type = batch; batch_type k = nearbyint(constants::invlog_2() * a); batch_type hi = fnma(k, constants::log_2hi(), a); batch_type lo = k * constants::log_2lo(); batch_type x = hi - lo; batch_type hxs = x * x * batch_type(0.5); batch_type r = detail::horner(hxs); batch_type t = batch_type(3.) - r * batch_type(0.5) * x; batch_type e = hxs * ((r - t) / (batch_type(6) - x * t)); batch_type c = (hi - x) - lo; e = (x * (e - c) - c) - hxs; using i_type = as_integer_t; i_type ik = to_int(k); batch_type two2mk = ::xsimd::bitwise_cast((constants::maxexponent() - ik) << constants::nmb()); batch_type ct1 = batch_type(1.) - two2mk - (e - x); batch_type ct2 = ++(x - (e + two2mk)); batch_type y = select(k < batch_type(20.), ct1, ct2); return ldexp(y, ik); } } template XSIMD_INLINE batch expm1(batch const& self, requires_arch) noexcept { using batch_type = batch; return select(self < constants::logeps(), batch_type(-1.), select(self > constants::maxlog(), constants::infinity(), detail::expm1(self))); } template XSIMD_INLINE batch, A> expm1(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch isin = sin(z.imag()); real_batch rem1 = expm1(z.real()); real_batch re = rem1 + 1.; real_batch si = sin(z.imag() * 0.5); return { rem1 - 2. * re * si * si, re * isin }; } // polar template XSIMD_INLINE batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept { auto sincosTheta = sincos(theta); return { r * sincosTheta.second, r * sincosTheta.first }; } // fdim template XSIMD_INLINE batch fdim(batch const& self, batch const& other, requires_arch) noexcept { return fmax(batch(0), self - other); } // fmod template XSIMD_INLINE batch fmod(batch const& self, batch const& other, requires_arch) noexcept { return fnma(trunc(self / other), other, self); } // frexp /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch frexp(const batch& self, batch, A>& exp, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; i_type m1f = constants::mask1frexp(); i_type r1 = m1f & ::xsimd::bitwise_cast(self); batch_type x = self & ::xsimd::bitwise_cast(~m1f); exp = (r1 >> constants::nmb()) - constants::maxexponentm1(); exp = select(batch_bool_cast(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0))); return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast(constants::mask2frexp()), batch_type(0.)); } // from bool template XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept { return batch(self.data) & batch(1); } // horner template XSIMD_INLINE batch horner(const batch& self) noexcept { return detail::horner, Coefs...>(self); } // hypot template XSIMD_INLINE batch hypot(batch const& self, batch const& other, requires_arch) noexcept { return sqrt(fma(self, self, other * other)); } // ipow template XSIMD_INLINE batch ipow(batch const& self, ITy other, requires_arch) noexcept { return ::xsimd::detail::ipow(self, other); } // ldexp /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { using batch_type = batch; using itype = as_integer_t; itype ik = other + constants::maxexponent(); ik = ik << constants::nmb(); return self * ::xsimd::bitwise_cast(ik); } // lgamma template XSIMD_INLINE batch lgamma(batch const& self, requires_arch) noexcept; namespace detail { /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static XSIMD_INLINE batch gammalnB(const batch& x) noexcept { return horner, 0x3ed87730, // 4.227843421859038E-001 0x3ea51a64, // 3.224669577325661E-001, 0xbd89f07e, // -6.735323259371034E-002, 0x3ca89ed8, // 2.058355474821512E-002, 0xbbf164fd, // -7.366775108654962E-003, 0x3b3ba883, // 2.863437556468661E-003, 0xbaabeab1, // -1.311620815545743E-003, 0x3a1ebb94 // 6.055172732649237E-004 >(x); } template static XSIMD_INLINE batch gammalnC(const batch& x) noexcept { return horner, 0xbf13c468, // -5.772156501719101E-001 0x3f528d34, // 8.224670749082976E-001, 0xbecd27a8, // -4.006931650563372E-001, 0x3e8a898b, // 2.705806208275915E-001, 0xbe53c04f, // -2.067882815621965E-001, 0x3e2d4dab, // 1.692415923504637E-001, 0xbe22d329, // -1.590086327657347E-001, 0x3e0c3c4f // 1.369488127325832E-001 >(x); } template static XSIMD_INLINE batch gammaln2(const batch& x) noexcept { return horner, 0x3daaaa94, // 8.333316229807355E-002f 0xbb358701, // -2.769887652139868E-003f, 0x3a31fd69 // 6.789774945028216E-004f >(x); } template static XSIMD_INLINE batch gammaln1(const batch& x) noexcept { return horner, 0xc12a0c675418055eull, // -8.53555664245765465627E5 0xc13a45890219f20bull, // -1.72173700820839662146E6, 0xc131bc82f994db51ull, // -1.16237097492762307383E6, 0xc1143d73f89089e5ull, // -3.31612992738871184744E5, 0xc0e2f234355bb93eull, // -3.88016315134637840924E4, 0xc09589018ff36761ull // -1.37825152569120859100E3 >(x) / horner, 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6 0xc1435255892ff34cull, // -2.53252307177582951285E6, 0xc131628671950043ull, // -1.13933444367982507207E6, 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5, 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4, 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2, 0x3ff0000000000000ull // 1.00000000000000000000E0 >(x); } template static XSIMD_INLINE batch gammalnA(const batch& x) noexcept { return horner, 0x3fb555555555554bull, // 8.33333333333331927722E-2 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3, 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4, 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4, 0x3f4a985027336661ull // 8.11614167470508450300E-4 >(x); } /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct lgamma_impl; template struct lgamma_impl> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept { auto inf_result = (a <= batch_type(0.)) && is_flint(a); batch_type x = select(inf_result, constants::nan(), a); batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (x == constants::infinity()) || inf_result; #endif auto ltza = a < batch_type(0.); batch_type r(0); batch_type r1 = other(q); if (any(ltza)) { r = select(inf_result, constants::infinity(), negative(q, r1)); if (all(ltza)) return r; } batch_type r2 = select(ltza, r, r1); return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); } private: static XSIMD_INLINE batch_type negative(const batch_type& q, const batch_type& w) noexcept { batch_type p = floor(q); batch_type z = q - p; auto test2 = z < batch_type(0.5); z = select(test2, z - batch_type(1.), z); z = q * sin(z, trigo_pi_tag()); return -log(constants::invpi() * abs(z)) - w; } static XSIMD_INLINE batch_type other(const batch_type& x) noexcept { auto xlt650 = (x < batch_type(6.5)); batch_type r0x = x; batch_type r0z = x; batch_type r0s = batch_type(1.); batch_type r1 = batch_type(0.); batch_type p = constants::nan(); if (any(xlt650)) { batch_type z = batch_type(1.); batch_type tx = select(xlt650, x, batch_type(0.)); batch_type nx = batch_type(0.); const batch_type _075 = batch_type(0.75); const batch_type _150 = batch_type(1.50); const batch_type _125 = batch_type(1.25); const batch_type _250 = batch_type(2.50); auto xge150 = (x >= _150); auto txgt250 = (tx > _250); // x >= 1.5 while (any(xge150 && txgt250)) { nx = select(txgt250, nx - batch_type(1.), nx); tx = select(txgt250, x + nx, tx); z = select(txgt250, z * tx, z); txgt250 = (tx > _250); } r0x = select(xge150, x + nx - batch_type(2.), x); r0z = select(xge150, z, r0z); r0s = select(xge150, batch_type(1.), r0s); // x >= 1.25 && x < 1.5 auto xge125 = (x >= _125); auto xge125t = xge125 && !xge150; if (any(xge125)) { r0x = select(xge125t, x - batch_type(1.), r0x); r0z = select(xge125t, z * x, r0z); r0s = select(xge125t, batch_type(-1.), r0s); } // x >= 0.75 && x < 1.5 batch_bool kernelC(false); auto xge075 = (x >= _075); auto xge075t = xge075 && !xge125; if (any(xge075t)) { kernelC = xge075t; r0x = select(xge075t, x - batch_type(1.), x); r0z = select(xge075t, batch_type(1.), r0z); r0s = select(xge075t, batch_type(-1.), r0s); p = gammalnC(r0x); } // tx < 1.5 && x < 0.75 auto txlt150 = (tx < _150) && !xge075; if (any(txlt150)) { auto orig = txlt150; while (any(txlt150)) { z = select(txlt150, z * tx, z); nx = select(txlt150, nx + batch_type(1.), nx); tx = select(txlt150, x + nx, tx); txlt150 = (tx < _150) && !xge075; } r0x = select(orig, r0x + nx - batch_type(2.), r0x); r0z = select(orig, z, r0z); r0s = select(orig, batch_type(-1.), r0s); } p = select(kernelC, p, gammalnB(r0x)); if (all(xlt650)) return fma(r0x, p, r0s * log(abs(r0z))); } r0z = select(xlt650, abs(r0z), x); batch_type m = log(r0z); r1 = fma(r0x, p, r0s * m); batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi() - x); r2 += gammaln2(batch_type(1.) 
/ (x * x)) / x; return select(xlt650, r1, r2); } }; template struct lgamma_impl> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& a) noexcept { auto inf_result = (a <= batch_type(0.)) && is_flint(a); batch_type x = select(inf_result, constants::nan(), a); batch_type q = abs(x); #ifndef XSIMD_NO_INFINITIES inf_result = (q == constants::infinity()); #endif auto test = (a < batch_type(-34.)); batch_type r = constants::nan(); if (any(test)) { r = large_negative(q); if (all(test)) return select(inf_result, constants::nan(), r); } batch_type r1 = other(a); batch_type r2 = select(test, r, r1); return select(a == constants::minusinfinity(), constants::nan(), select(inf_result, constants::infinity(), r2)); } private: // FIXME: cannot mark this one as XSIMD_INLINE because there's a // recursive loop on `lgamma'. static inline batch_type large_negative(const batch_type& q) noexcept { batch_type w = lgamma(q); batch_type p = floor(q); batch_type z = q - p; auto test2 = (z < batch_type(0.5)); z = select(test2, z - batch_type(1.), z); z = q * sin(z, trigo_pi_tag()); z = abs(z); return constants::logpi() - log(z) - w; } static XSIMD_INLINE batch_type other(const batch_type& xx) noexcept { batch_type x = xx; auto test = (x < batch_type(13.)); batch_type r1 = batch_type(0.); if (any(test)) { batch_type z = batch_type(1.); batch_type p = batch_type(0.); batch_type u = select(test, x, batch_type(0.)); auto test1 = (u >= batch_type(3.)); while (any(test1)) { p = select(test1, p - batch_type(1.), p); u = select(test1, x + p, u); z = select(test1, z * u, z); test1 = (u >= batch_type(3.)); } auto test2 = (u < batch_type(2.)); while (any(test2)) { z = select(test2, z / u, z); p = select(test2, p + batch_type(1.), p); u = select(test2, x + p, u); test2 = (u < batch_type(2.)); } z = abs(z); x += p - batch_type(2.); r1 = x * gammaln1(x) + log(z); if (all(test)) return r1; } batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi() - xx); batch_type p = batch_type(1.) / (xx * xx); r2 += gammalnA(p) / xx; return select(test, r1, r2); } }; } template XSIMD_INLINE batch lgamma(batch const& self, requires_arch) noexcept { return detail::lgamma_impl>::compute(self); } // log /* origin: boost/simd/arch/common/simd/function/log.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch log(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(23), k); x = select(test, x * batch_type(8388608ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) 
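// Illustrative scalar sketch of the log() scheme above: split x = m * 2^k with
// m close to 1 (the kernel does this by manipulating the exponent bits; frexp
// is used here), so that log(x) = k*ln2 + log(m). log(m) is evaluated from
// f = m - 1 and s = f / (2 + f) via log(1 + f) = 2*(s + s^3/3 + s^5/5 + ...).
// Assumes x > 0 and finite; the kernel also handles zero, negatives, infinity
// and denormals.
#include <cmath>

inline double log_sketch(double x)
{
    int k = 0;
    double m = std::frexp(x, &k);   // x = m * 2^k, m in [0.5, 1)
    if (m < 0.70710678118654752440) // renormalize so m lies in [sqrt(2)/2, sqrt(2))
    {
        m *= 2.0;
        --k;
    }
    double f = m - 1.0;
    double s = f / (2.0 + f);
    double s2 = s * s;
    // Truncated series; the kernel uses a minimax polynomial in s^2 plus an
    // exact hi/lo split of ln2 for the k*ln2 term.
    double r = 2.0 * s * (1.0 + s2 * (1.0 / 3.0 + s2 * (1.0 / 5.0 + s2 / 7.0)));
    return k * 0.69314718055994530942 + r;
}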
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch log(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; batch_type dk = to_float(k); hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type hfsq = batch_type(0.5) * f * f; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo()) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch, A> log(const batch, A>& z, requires_arch) noexcept { return batch, A>(log(abs(z)), atan2(z.imag(), z.real())); } // log2 template XSIMD_INLINE batch log2(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(25), k); x = select(test, x * batch_type(33554432ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) 
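// Illustrative scalar version of the complex log() above:
// log(z) = log|z| + i*arg(z), so the modulus feeds the real part and
// atan2(imag, real) the imaginary part.
#include <cmath>
#include <complex>

inline std::complex<double> clog_sketch(std::complex<double> z)
{
    return { std::log(std::abs(z)), std::atan2(z.imag(), z.real()) };
}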
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t1 + t2; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2(), dk); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch log2(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type hi = f - hfsq; hi = hi & ::xsimd::bitwise_cast((constants::allbits() << 32)); batch_type lo = fma(s, hfsq + R, f - hi - hfsq); batch_type val_hi = hi * constants::invlog_2hi(); batch_type val_lo = fma(lo + hi, constants::invlog_2lo(), lo * constants::invlog_2hi()); batch_type dk = to_float(k); batch_type w1 = dk + val_hi; val_lo += (dk - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } namespace detail { template XSIMD_INLINE batch logN_complex_impl(const batch& z, typename batch::value_type base) noexcept { using batch_type = batch; using rv_type = typename batch_type::value_type; return log(z) / batch_type(rv_type(base)); } } template XSIMD_INLINE batch, A> log2(batch, A> const& self, requires_arch) noexcept { return detail::logN_complex_impl(self, std::log(2)); } // log10 /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */ /* * ==================================================== * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. * * Developed at SunPro, a Sun Microsystems, Inc. business. * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice * is preserved. 
* ==================================================== */ template XSIMD_INLINE batch log10(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type ivln10hi(4.3432617188e-01f), ivln10lo(-3.1689971365e-05f), log10_2hi(3.0102920532e-01f), log10_2lo(7.9034151668e-07f); using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(25), k); x = select(test, x * batch_type(33554432ul), x); } #endif i_type ix = ::xsimd::bitwise_cast(x); ix += 0x3f800000 - 0x3f3504f3; k += (ix >> 23) - 0x7f; ix = (ix & i_type(0x007fffff)) + 0x3f3504f3; x = ::xsimd::bitwise_cast(ix); batch_type f = --x; batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); batch_type hfsq = batch_type(0.5) * f * f; batch_type hibits = f - hfsq; hibits &= ::xsimd::bitwise_cast(i_type(0xfffff000)); batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq); batch_type r = fma(dk, log10_2hi, fma(hibits, ivln10hi, fma(lobits, ivln10hi, fma(lobits + hibits, ivln10lo, dk * log10_2lo)))); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch log10(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type ivln10hi(4.34294481878168880939e-01), ivln10lo(2.50829467116452752298e-11), log10_2hi(3.01029995663611771306e-01), log10_2lo(3.69423907715893078616e-13); using int_type = as_integer_t; using i_type = batch; batch_type x = self; i_type hx = ::xsimd::bitwise_cast(x) >> 32; i_type k(0); auto isnez = (self != batch_type(0.)); #ifndef XSIMD_NO_DENORMALS auto test = (self < constants::smallestposval()) && isnez; if (any(test)) { k = select(batch_bool_cast(test), k - i_type(54), k); x = select(test, x * batch_type(18014398509481984ull), x); } #endif hx += 0x3ff00000 - 0x3fe6a09e; k += (hx >> 20) - 0x3ff; hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e; x = ::xsimd::bitwise_cast(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast(x))); batch_type f = --x; batch_type dk = to_float(k); batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type hi = f - hfsq; hi = hi & ::xsimd::bitwise_cast(constants::allbits() << 32); batch_type lo = f - hi - hfsq + s * (hfsq + R); batch_type val_hi = hi * ivln10hi; batch_type y = dk * log10_2hi; batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi; batch_type w1 = y + val_hi; val_lo += (y - w1) + val_hi; val_hi = w1; batch_type r = val_lo + val_hi; #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(self >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch, A> log10(const batch, A>& z, requires_arch) noexcept { return detail::logN_complex_impl(z, std::log(10)); } // log1p /* origin: boost/simd/arch/common/simd/function/log1p.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch log1p(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; const batch_type uf = self + batch_type(1.); auto isnez = (uf != batch_type(0.)); i_type iu = ::xsimd::bitwise_cast(uf); iu += 0x3f800000 - 0x3f3504f3; i_type k = (iu >> 23) - 0x7f; iu = (iu & i_type(0x007fffff)) + 0x3f3504f3; batch_type f = --(::xsimd::bitwise_cast(iu)); batch_type s = f / (batch_type(2.) + f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type hfsq = batch_type(0.5) * f * f; batch_type dk = to_float(k); /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; batch_type r = fma(dk, constants::log_2hi(), fma(s, (hfsq + R), dk * constants::log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(uf >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch log1p(batch const& self, requires_arch) noexcept { using batch_type = batch; using int_type = as_integer_t; using i_type = batch; const batch_type uf = self + batch_type(1.); auto isnez = (uf != batch_type(0.)); i_type hu = ::xsimd::bitwise_cast(uf) >> 32; hu += 0x3ff00000 - 0x3fe6a09e; i_type k = (hu >> 20) - 0x3ff; /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */ batch_type c = select(batch_bool_cast(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf; hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e; batch_type f = ::xsimd::bitwise_cast((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast(uf))); f = --f; batch_type hfsq = batch_type(0.5) * f * f; batch_type s = f / (batch_type(2.) 
+ f); batch_type z = s * s; batch_type w = z * z; batch_type t1 = w * detail::horner(w); batch_type t2 = z * detail::horner(w); batch_type R = t2 + t1; batch_type dk = to_float(k); batch_type r = fma(dk, constants::log_2hi(), fma(s, hfsq + R, dk * constants::log_2lo() + c) - hfsq + f); #ifndef XSIMD_NO_INFINITIES batch_type zz = select(isnez, select(self == constants::infinity(), constants::infinity(), r), constants::minusinfinity()); #else batch_type zz = select(isnez, r, constants::minusinfinity()); #endif return select(!(uf >= batch_type(0.)), constants::nan(), zz); } template XSIMD_INLINE batch, A> log1p(batch, A> const& self, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type u = 1 + self; batch_type logu = log(u); return select(u == batch_type(1.), self, select(u.real() <= real_batch(0.), logu, logu * self / (u - batch_type(1.)))); } // mod template ::value, void>::type> XSIMD_INLINE batch mod(batch const& self, batch const& other, requires_arch) noexcept { return detail::apply([](T x, T y) noexcept -> T { return x % y; }, self, other); } // nearbyint template ::value, void>::type> XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return self; } namespace detail { template XSIMD_INLINE batch nearbyintf(batch const& self) noexcept { using batch_type = batch; batch_type s = bitofsign(self); batch_type v = self ^ s; batch_type t2n = constants::twotonmb(); // Under fast-math, reordering is possible and the compiler optimizes d // to v. That's not what we want, so prevent compiler optimization here. // FIXME: it may be better to emit a memory barrier here (?). #ifdef __FAST_MATH__ volatile batch_type d0 = v + t2n; batch_type d = *(batch_type*)(void*)(&d0) - t2n; #else batch_type d0 = v + t2n; batch_type d = d0 - t2n; #endif return s ^ select(v < t2n, d, v); } } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return detail::nearbyintf(self); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return detail::nearbyintf(self); } // nearbyint_as_int template ::value, void>::type> XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return self; } // nearbyint_as_int template XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, requires_arch) noexcept { using U = as_integer_t; return kernel::detail::apply_transform([](float x) noexcept -> U { return std::nearbyintf(x); }, self); } template XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, requires_arch) noexcept { using U = as_integer_t; return kernel::detail::apply_transform([](double x) noexcept -> U { return std::nearbyint(x); }, self); } // nextafter namespace detail { template ::value> struct nextafter_kernel { using batch_type = batch; static XSIMD_INLINE batch_type next(batch_type const& b) noexcept { return b; } static XSIMD_INLINE batch_type prev(batch_type const& b) noexcept { return b; } }; template struct bitwise_cast_batch; template struct bitwise_cast_batch { using type = batch; }; template struct bitwise_cast_batch { using type = batch; }; template struct nextafter_kernel { using batch_type = batch; using int_batch = typename bitwise_cast_batch::type; using int_type = typename int_batch::value_type; static XSIMD_INLINE batch_type next(const batch_type& b) noexcept { batch_type n = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) + int_type(1)); return select(b == constants::infinity(), b, n); } static XSIMD_INLINE batch_type 
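// Illustrative scalar version of the rounding trick in nearbyintf above:
// for |x| < 2^23, adding and then subtracting 2^23 leaves no room for
// fractional bits, so the intermediate sum is rounded to an integer in the
// current rounding mode. Magnitudes of 2^23 and above (and NaN) are already
// integral and are returned as-is. Like the kernel, this relies on the
// compiler not optimizing the add/subtract away (hence the volatile
// workaround under __FAST_MATH__ in the code above).
#include <cmath>

inline float nearbyint_sketch(float x)
{
    const float two23 = 8388608.0f; // 2^23
    float ax = std::fabs(x);
    if (!(ax < two23))
        return x;                   // already an integer, or inf/NaN
    float r = (ax + two23) - two23; // rounds ax in the current FP rounding mode
    return std::copysign(r, x);     // restore the original sign
}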
prev(const batch_type& b) noexcept { batch_type p = ::xsimd::bitwise_cast(::xsimd::bitwise_cast(b) - int_type(1)); return select(b == constants::minusinfinity(), b, p); } }; } template XSIMD_INLINE batch nextafter(batch const& from, batch const& to, requires_arch) noexcept { using kernel = detail::nextafter_kernel; return select(from == to, from, select(to > from, kernel::next(from), kernel::prev(from))); } // pow /* origin: boost/simd/arch/common/simd/function/pow.hpp*/ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch pow(batch const& self, batch const& other, requires_arch) noexcept { using batch_type = batch; const auto zero = batch_type(0.); auto negself = self < zero; auto iszeropowpos = self == zero && other >= zero; auto adj_self = select(iszeropowpos, batch_type(1), abs(self)); batch_type z = exp(other * log(adj_self)); z = select(iszeropowpos, zero, z); z = select(is_odd(other) && negself, -z, z); auto invalid = negself && !(is_flint(other) || isinf(other)); return select(invalid, constants::nan(), z); } template XSIMD_INLINE batch, A> pow(const batch, A>& a, const batch, A>& z, requires_arch) noexcept { using cplx_batch = batch, A>; using real_batch = typename cplx_batch::real_batch; real_batch absa = abs(a); real_batch arga = arg(a); real_batch x = z.real(); real_batch y = z.imag(); real_batch r = pow(absa, x); real_batch theta = x * arga; real_batch ze(0); auto cond = (y == ze); r = select(cond, r, r * exp(-y * arga)); theta = select(cond, theta, theta + y * log(absa)); auto sincosTheta = xsimd::sincos(theta); return select(absa == ze, cplx_batch(ze), cplx_batch(r * sincosTheta.second, r * sincosTheta.first)); } template inline batch, A> pow(const batch, A>& a, const batch& z, requires_arch) noexcept { using cplx_batch = batch, A>; auto absa = abs(a); auto arga = arg(a); auto r = pow(absa, z); auto theta = z * arga; auto sincosTheta = xsimd::sincos(theta); return select(absa == 0, cplx_batch(0), cplx_batch(r * sincosTheta.second, r * sincosTheta.first)); } template inline batch, A> pow(const batch& a, const batch, A>& z, requires_arch) noexcept { return pow(batch, A> { a, batch {} }, z); } // reciprocal template ::value, void>::type> XSIMD_INLINE batch reciprocal(batch const& self, requires_arch) noexcept { using batch_type = batch; return div(batch_type(1), self); } // reduce_add template XSIMD_INLINE std::complex reduce_add(batch, A> const& self, requires_arch) noexcept { return { reduce_add(self.real()), reduce_add(self.imag()) }; } namespace detail { template struct split_high { static constexpr T get(T i, T) { return i >= N ? 
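// Illustrative scalar sketch of the real pow() above: the core identity is
// |x|^y = exp(y * log|x|); a negative base is only valid for integer
// exponents, and the sign of the result is restored when that integer is odd.
// Zero and infinite inputs (handled through iszeropowpos and isinf in the
// kernel) are left out here.
#include <cmath>

inline double pow_sketch(double x, double y)
{
    bool neg_base = x < 0.0;
    bool y_is_int = (y == std::trunc(y));
    if (neg_base && !y_is_int)
        return std::nan("");                         // (-a)^(non-integer) is invalid
    double z = std::exp(y * std::log(std::fabs(x))); // |x|^y = e^(y * ln|x|)
    bool y_is_odd = y_is_int && (std::fmod(y, 2.0) != 0.0);
    return (neg_base && y_is_odd) ? -z : z;          // odd integer exponent keeps the sign
}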
(i % 2) : i + N; } }; template XSIMD_INLINE T reduce(Op, batch const& self, std::integral_constant) noexcept { return self.get(0); } template XSIMD_INLINE T reduce(Op op, batch const& self, std::integral_constant) noexcept { using index_type = as_unsigned_integer_t; batch split = swizzle(self, make_batch_constant>()); return reduce(op, op(split, self), std::integral_constant()); } } // reduce_max template XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { return detail::reduce([](batch const& x, batch const& y) { return max(x, y); }, self, std::integral_constant::size>()); } // reduce_min template XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { return detail::reduce([](batch const& x, batch const& y) { return min(x, y); }, self, std::integral_constant::size>()); } // remainder template XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { return fnma(nearbyint(self / other), other, self); } template XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { return fnma(nearbyint(self / other), other, self); } template ::value, void>::type> XSIMD_INLINE batch remainder(batch const& self, batch const& other, requires_arch) noexcept { auto mod = self % other; return select(mod <= other / 2, mod, mod - other); } // select template XSIMD_INLINE batch, A> select(batch_bool const& cond, batch, A> const& true_br, batch, A> const& false_br, requires_arch) noexcept { return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) }; } // sign template ::value, void>::type> XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0)); return res; } namespace detail { template XSIMD_INLINE batch signf(batch const& self) noexcept { using batch_type = batch; batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f)); #ifdef XSIMD_NO_NANS return res; #else return select(isnan(self), constants::nan(), res); #endif } } template XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept { return detail::signf(self); } template XSIMD_INLINE batch sign(batch const& self, requires_arch) noexcept { return detail::signf(self); } template XSIMD_INLINE batch, A> sign(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; auto rz = z.real(); auto iz = z.imag(); return select(rz != real_batch(0.), batch_type(sign(rz)), batch_type(sign(iz))); } // signnz template ::value, void>::type> XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept { using batch_type = batch; return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.); } namespace detail { template XSIMD_INLINE batch signnzf(batch const& self) noexcept { using batch_type = batch; #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), batch_type(1.) | (constants::signmask() & self)); #else return batch_type(1.) 
| (constants::signmask() & self); #endif } } template XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept { return detail::signnzf(self); } template XSIMD_INLINE batch signnz(batch const& self, requires_arch) noexcept { return detail::signnzf(self); } // sqrt template XSIMD_INLINE batch, A> sqrt(batch, A> const& z, requires_arch) noexcept { constexpr T csqrt_scale_factor = std::is_same::value ? 6.7108864e7f : 1.8014398509481984e16; constexpr T csqrt_scale = std::is_same::value ? 1.220703125e-4f : 7.450580596923828125e-9; using batch_type = batch, A>; using real_batch = batch; real_batch x = z.real(); real_batch y = z.imag(); real_batch sqrt_x = sqrt(fabs(x)); real_batch sqrt_hy = sqrt(0.5 * fabs(y)); auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.)); x = select(cond, x * 0.25, x * csqrt_scale_factor); y = select(cond, y * 0.25, y * csqrt_scale_factor); real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale)); real_batch r = abs(batch_type(x, y)); auto condxp = x > real_batch(0.); real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x))); real_batch r0 = scale * fabs((0.5 * y) / t0); t0 *= scale; real_batch t = select(condxp, t0, r0); r = select(condxp, r0, t0); batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r)); real_batch ze(0.); return select(y == ze, select(x == ze, batch_type(ze, ze), select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))), select(x == ze, select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)), resg)); } // tgamma namespace detail { /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct stirling_kernel; template struct stirling_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x); } static XSIMD_INLINE batch_type split_limit() noexcept { return batch_type(bit_cast(uint32_t(0x41d628f6))); } static XSIMD_INLINE batch_type large_limit() noexcept { return batch_type(bit_cast(uint32_t(0x420c28f3))); } }; template struct stirling_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x); } static XSIMD_INLINE batch_type split_limit() noexcept { return batch_type(bit_cast(uint64_t(0x4061e083ba3443d4))); } static XSIMD_INLINE batch_type large_limit() noexcept { return batch_type(bit_cast(uint64_t(0x4065800000000000))); } }; /* origin: boost/simd/arch/common/simd/function/stirling.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch stirling(const batch& a) noexcept { using batch_type = batch; const batch_type stirlingsplitlim = stirling_kernel::split_limit(); const batch_type stirlinglargelim = stirling_kernel::large_limit(); batch_type x = select(a >= batch_type(0.), a, constants::nan()); batch_type w = batch_type(1.) 
/ x; w = fma(w, stirling_kernel::compute(w), batch_type(1.)); batch_type y = exp(-x); auto test = (x < stirlingsplitlim); batch_type z = x - batch_type(0.5); z = select(test, z, batch_type(0.5) * z); batch_type v = exp(z * log(abs(x))); y *= v; y = select(test, y, y * v); y *= constants::sqrt_2pi() * w; #ifndef XSIMD_NO_INFINITIES y = select(isinf(x), x, y); #endif return select(x > stirlinglargelim, constants::infinity(), y); } /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tgamma_kernel; template struct tgamma_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x); } }; template struct tgamma_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type compute(const batch_type& x) noexcept { return horner(x) / horner(x); } }; /* origin: boost/simd/arch/common/simd/function/gamma.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE B tgamma_large_negative(const B& a) noexcept { B st = stirling(a); B p = floor(a); B sgngam = select(is_even(p), -B(1.), B(1.)); B z = a - p; auto test2 = z < B(0.5); z = select(test2, z - B(1.), z); z = a * sin(z, trigo_pi_tag()); z = abs(z); return sgngam * constants::pi() / (z * st); } template XSIMD_INLINE B tgamma_other(const B& a, const BB& test) noexcept { B x = select(test, B(2.), a); #ifndef XSIMD_NO_INFINITIES auto inf_result = (a == constants::infinity()); x = select(inf_result, B(2.), x); #endif B z = B(1.); auto test1 = (x >= B(3.)); while (any(test1)) { x = select(test1, x - B(1.), x); z = select(test1, z * x, z); test1 = (x >= B(3.)); } test1 = (x < B(0.)); while (any(test1)) { z = select(test1, z / x, z); x = select(test1, x + B(1.), x); test1 = (x < B(0.)); } auto test2 = (x < B(2.)); while (any(test2)) { z = select(test2, z / x, z); x = select(test2, x + B(1.), x); test2 = (x < B(2.)); } x = z * tgamma_kernel::compute(x - B(2.)); #ifndef XSIMD_NO_INFINITIES return select(inf_result, a, x); #else return x; #endif } } template XSIMD_INLINE batch tgamma(batch const& self, requires_arch) noexcept { using batch_type = batch; auto nan_result = (self < batch_type(0.) 
&& is_flint(self)); #ifndef XSIMD_NO_INVALIDS nan_result = isnan(self) || nan_result; #endif batch_type q = abs(self); auto test = (self < batch_type(-33.)); batch_type r = constants::nan(); if (any(test)) { r = detail::tgamma_large_negative(q); if (all(test)) return select(nan_result, constants::nan(), r); } batch_type r1 = detail::tgamma_other(self, test); batch_type r2 = select(test, r, r1); return select(self == batch_type(0.), copysign(constants::infinity(), self), select(nan_result, constants::nan(), r2)); } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_memory.hpp000066400000000000000000001155241475736624100247710ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_MEMORY_HPP #define XSIMD_GENERIC_MEMORY_HPP #include #include #include #include "../../types/xsimd_batch_constant.hpp" #include "./xsimd_generic_details.hpp" namespace xsimd { template struct batch_constant; template struct batch_bool_constant; namespace kernel { using namespace types; // broadcast namespace detail { template struct broadcaster { using return_type = batch; static XSIMD_INLINE return_type run(T v) noexcept { return return_type::broadcast(v); } }; template struct broadcaster { using return_type = batch_bool, A>; static XSIMD_INLINE return_type run(bool b) noexcept { return return_type(b); } }; } // compress namespace detail { template XSIMD_INLINE batch create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) { batch swizzle_mask(IT(0)); alignas(A::alignment()) IT mask_buffer[batch::size] = { Is... }; size_t inserted = 0; for (size_t i = 0; i < sizeof...(Is); ++i) if ((bitmask >> i) & 1u) std::swap(mask_buffer[inserted++], mask_buffer[i]); return batch::load_aligned(&mask_buffer[0]); } } template XSIMD_INLINE batch compress(batch const& x, batch_bool const& mask, kernel::requires_arch) noexcept { using IT = as_unsigned_integer_t; constexpr std::size_t size = batch_bool::size; auto bitmask = mask.mask(); auto z = select(mask, x, batch((T)0)); auto compress_mask = detail::create_compress_swizzle_mask(bitmask, ::xsimd::detail::make_index_sequence()); return swizzle(z, compress_mask); } // expand namespace detail { template XSIMD_INLINE batch create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) { batch swizzle_mask(IT(0)); IT j = 0; (void)std::initializer_list { ((swizzle_mask = insert(swizzle_mask, j, index())), (j += ((bitmask >> Is) & 1u)), true)... 
}; return swizzle_mask; } } template XSIMD_INLINE batch expand(batch const& x, batch_bool const& mask, kernel::requires_arch) noexcept { constexpr std::size_t size = batch_bool::size; auto bitmask = mask.mask(); auto swizzle_mask = detail::create_expand_swizzle_mask, A>(bitmask, ::xsimd::detail::make_index_sequence()); auto z = swizzle(x, swizzle_mask); return select(mask, z, batch(T(0))); } // extract_pair template XSIMD_INLINE batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept { constexpr std::size_t size = batch::size; assert(i < size && "index in bounds"); alignas(A::alignment()) T self_buffer[size]; self.store_aligned(self_buffer); alignas(A::alignment()) T other_buffer[size]; other.store_aligned(other_buffer); alignas(A::alignment()) T concat_buffer[size]; for (std::size_t j = 0; j < (size - i); ++j) { concat_buffer[j] = other_buffer[i + j]; if (j < i) { concat_buffer[size - 1 - j] = self_buffer[i - 1 - j]; } } return batch::load_aligned(concat_buffer); } // gather namespace detail { // Not using XSIMD_INLINE here as it makes msvc hand got ever on avx512 template ::type = 0> inline batch gather(U const* src, batch const& index, ::xsimd::index I) noexcept { return insert(batch {}, static_cast(src[index.get(I)]), I); } template ::type = 0> inline batch gather(U const* src, batch const& index, ::xsimd::index I) noexcept { static_assert(N <= batch::size, "Incorrect value in recursion!"); const auto test = gather(src, index, {}); return insert(test, static_cast(src[index.get(I)]), I); } } // namespace detail template XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return detail::gather::size - 1, T, A>(src, index, {}); } // Gather with runtime indexes and mismatched strides. template XSIMD_INLINE detail::sizes_mismatch_t> gather(batch const&, U const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return detail::gather::size - 1, T, A>(src, index, {}); } // Gather with runtime indexes and matching strides. 
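        // (Editorial addition, not part of the original source.) A minimal usage
        // sketch for these runtime-index gather kernels, assuming the public
        // xsimd::batch<T, A>::gather entry point and a 4-lane float batch:
        //
        //     float src[16] = { /* ... */ };
        //     int32_t idx_buf[4] = { 3, 1, 4, 1 };
        //     auto idx = xsimd::batch<int32_t, A>::load_unaligned(idx_buf);
        //     auto v = xsimd::batch<float, A>::gather(src, idx); // { src[3], src[1], src[4], src[1] }
        //
        // When the index and destination scalars have matching widths, the overload
        // below gathers with the native-width kernel and converts the result via
        // batch_cast; mismatched widths fall back to the element-wise detail::gather
        // recursion above.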
template XSIMD_INLINE detail::stride_match_t> gather(batch const&, U const* src, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Index and destination sizes must match"); return batch_cast(kernel::gather(batch {}, src, index, A {})); } // insert template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept { struct index_mask { static constexpr bool get(size_t index, size_t /* size*/) { return index != I; } }; batch tmp(val); return select(make_batch_bool_constant(), self, tmp); } // get template XSIMD_INLINE T get(batch const& self, ::xsimd::index, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template XSIMD_INLINE T get(batch_bool const& self, ::xsimd::index, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch_bool::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template XSIMD_INLINE auto get(batch, A> const& self, ::xsimd::index, requires_arch) noexcept -> typename batch, A>::value_type { alignas(A::alignment()) T buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[I]; } template XSIMD_INLINE T get(batch const& self, std::size_t i, requires_arch) noexcept { alignas(A::alignment()) T buffer[batch::size]; self.store_aligned(&buffer[0]); return buffer[i]; } template XSIMD_INLINE T get(batch_bool const& self, std::size_t i, requires_arch) noexcept { alignas(A::alignment()) bool buffer[batch_bool::size]; self.store_aligned(&buffer[0]); return buffer[i]; } template XSIMD_INLINE auto get(batch, A> const& self, std::size_t i, requires_arch) noexcept -> typename batch, A>::value_type { using T2 = typename batch, A>::value_type; alignas(A::alignment()) T2 buffer[batch, A>::size]; self.store_aligned(&buffer[0]); return buffer[i]; } // load_aligned namespace detail { template XSIMD_INLINE batch load_aligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept { using batch_type_in = batch; using batch_type_out = batch; return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {}); } template XSIMD_INLINE batch load_aligned(T_in const* mem, convert, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be a direct load for this type combination"); using batch_type_out = batch; alignas(A::alignment()) T_out buffer[batch_type_out::size]; std::copy(mem, mem + batch_type_out::size, std::begin(buffer)); return batch_type_out::load_aligned(buffer); } } template XSIMD_INLINE batch load_aligned(T_in const* mem, convert cvt, requires_arch) noexcept { return detail::load_aligned(mem, cvt, A {}, detail::conversion_type {}); } // load_unaligned namespace detail { template XSIMD_INLINE batch load_unaligned(T_in const* mem, convert, requires_arch, with_fast_conversion) noexcept { using batch_type_in = batch; using batch_type_out = batch; return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {}); } template XSIMD_INLINE batch load_unaligned(T_in const* mem, convert cvt, requires_arch, with_slow_conversion) noexcept { static_assert(!std::is_same::value, "there should be a direct load for this type combination"); return load_aligned(mem, cvt, generic {}, with_slow_conversion {}); } } template XSIMD_INLINE batch load_unaligned(T_in const* mem, convert cvt, requires_arch) noexcept { return detail::load_unaligned(mem, cvt, generic {}, detail::conversion_type {}); } // rotate_right template XSIMD_INLINE batch rotate_right(batch 
const& self, requires_arch) noexcept { struct rotate_generator { static constexpr size_t get(size_t index, size_t size) { return (index - N) % size; } }; return swizzle(self, make_batch_constant, A, rotate_generator>(), A {}); } template XSIMD_INLINE batch, A> rotate_right(batch, A> const& self, requires_arch) noexcept { return { rotate_right(self.real()), rotate_right(self.imag()) }; } // rotate_left template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { struct rotate_generator { static constexpr size_t get(size_t index, size_t size) { return (index + N) % size; } }; return swizzle(self, make_batch_constant, A, rotate_generator>(), A {}); } template XSIMD_INLINE batch, A> rotate_left(batch, A> const& self, requires_arch) noexcept { return { rotate_left(self.real()), rotate_left(self.imag()) }; } // Scatter with runtime indexes. namespace detail { template ::type = 0> XSIMD_INLINE void scatter(batch const& src, U* dst, batch const& index, ::xsimd::index I) noexcept { dst[index.get(I)] = static_cast(src.get(I)); } template ::type = 0> XSIMD_INLINE void scatter(batch const& src, U* dst, batch const& index, ::xsimd::index I) noexcept { static_assert(N <= batch::size, "Incorrect value in recursion!"); kernel::detail::scatter( src, dst, index, {}); dst[index.get(I)] = static_cast(src.get(I)); } } // namespace detail template XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); kernel::detail::scatter::size - 1, T, A, T, V>( src, dst, index, {}); } template XSIMD_INLINE detail::sizes_mismatch_t scatter(batch const& src, U* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); kernel::detail::scatter::size - 1, T, A, U, V>( src, dst, index, {}); } template XSIMD_INLINE detail::stride_match_t scatter(batch const& src, U* dst, batch const& index, kernel::requires_arch) noexcept { static_assert(batch::size == batch::size, "Source and index sizes must match"); const auto tmp = batch_cast(src); kernel::scatter(tmp, dst, index, A {}); } // shuffle namespace detail { constexpr bool is_swizzle_fst(size_t) { return true; } template constexpr bool is_swizzle_fst(size_t bsize, ITy index, ITys... indices) { return index < bsize && is_swizzle_fst(bsize, indices...); } constexpr bool is_swizzle_snd(size_t) { return true; } template constexpr bool is_swizzle_snd(size_t bsize, ITy index, ITys... indices) { return index >= bsize && is_swizzle_snd(bsize, indices...); } constexpr bool is_zip_lo(size_t) { return true; } template constexpr bool is_zip_lo(size_t, ITy) { return false; } template constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { return index0 == (bsize - (sizeof...(indices) + 2)) && index1 == (2 * bsize - (sizeof...(indices) + 2)) && is_zip_lo(bsize, indices...); } constexpr bool is_zip_hi(size_t) { return true; } template constexpr bool is_zip_hi(size_t, ITy) { return false; } template constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { return index0 == (bsize / 2 + bsize - (sizeof...(indices) + 2)) && index1 == (bsize / 2 + 2 * bsize - (sizeof...(indices) + 2)) && is_zip_hi(bsize, indices...); } constexpr bool is_select(size_t) { return true; } template constexpr bool is_select(size_t bsize, ITy index, ITys... indices) { return (index < bsize ? 
index : index - bsize) == (bsize - sizeof...(ITys)) && is_select(bsize, indices...); } } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept { constexpr size_t bsize = sizeof...(Indices); static_assert(bsize == batch::size, "valid shuffle"); // Detect common patterns XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...)) { return swizzle(x, batch_constant= bsize) ? 0 /* never happens */ : Indices)...>()); } XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...)) { return swizzle(y, batch_constant= bsize) ? (Indices - bsize) : 0 /* never happens */)...>()); } XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...)) { return zip_lo(x, y); } XSIMD_IF_CONSTEXPR(detail::is_zip_hi(bsize, Indices...)) { return zip_hi(x, y); } XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...)) { return select(batch_bool_constant(), x, y); } #if defined(__has_builtin) && !defined(XSIMD_WITH_EMULATED) #if __has_builtin(__builtin_shufflevector) #define builtin_shuffle __builtin_shufflevector #endif #endif #if defined(builtin_shuffle) typedef T vty __attribute__((__vector_size__(sizeof(batch)))); return (typename batch::register_type)builtin_shuffle((vty)x.data, (vty)y.data, Indices...); // FIXME: my experiments show that GCC only correctly optimizes this builtin // starting at GCC 13, where it already has __builtin_shuffle_vector // // #elif __has_builtin(__builtin_shuffle) || GCC >= 6 // typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch)))); // return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...}); #else // Use a generic_pattern. It is suboptimal but clang optimizes this // pretty well. batch x_lane = swizzle(x, batch_constant= bsize) ? (Indices - bsize) : Indices)...>()); batch y_lane = swizzle(y, batch_constant= bsize) ? 
(Indices - bsize) : Indices)...>()); batch_bool_constant select_x_lane; return select(select_x_lane, x_lane, y_lane); #endif } // store template XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept { using batch_type = batch; constexpr auto size = batch_bool::size; alignas(A::alignment()) T buffer[size]; kernel::store_aligned(&buffer[0], batch_type(self), A {}); for (std::size_t i = 0; i < size; ++i) mem[i] = bool(buffer[i]); } // store_aligned template XSIMD_INLINE void store_aligned(T_out* mem, batch const& self, requires_arch) noexcept { static_assert(!std::is_same::value, "there should be a direct store for this type combination"); alignas(A::alignment()) T_in buffer[batch::size]; store_aligned(&buffer[0], self); std::copy(std::begin(buffer), std::end(buffer), mem); } // store_unaligned template XSIMD_INLINE void store_unaligned(T_out* mem, batch const& self, requires_arch) noexcept { static_assert(!std::is_same::value, "there should be a direct store for this type combination"); return store_aligned(mem, self, generic {}); } // swizzle template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch_constant mask, requires_arch) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { constexpr size_t size = batch::size; alignas(A::alignment()) T self_buffer[size]; store_aligned(&self_buffer[0], self); alignas(A::alignment()) ITy mask_buffer[size]; store_aligned(&mask_buffer[0], mask); alignas(A::alignment()) T out_buffer[size]; for (size_t i = 0; i < size; ++i) out_buffer[i] = self_buffer[mask_buffer[i]]; return batch::load_aligned(out_buffer); } template XSIMD_INLINE batch, A> swizzle(batch, A> const& self, batch mask, requires_arch) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } // load_complex_aligned namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& /*hi*/, batch const& /*lo*/, requires_arch) noexcept { static_assert(std::is_same::value, "load_complex not implemented for the required architecture"); } template XSIMD_INLINE batch complex_high(batch, A> const& /*src*/, requires_arch) noexcept { static_assert(std::is_same::value, "complex_high not implemented for the required architecture"); } template XSIMD_INLINE batch complex_low(batch, A> const& /*src*/, requires_arch) noexcept { static_assert(std::is_same::value, "complex_low not implemented for the required architecture"); } } template XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; T_in const* buffer = reinterpret_cast(mem); real_batch hi = real_batch::load_aligned(buffer), lo = real_batch::load_aligned(buffer + real_batch::size); return detail::load_complex(hi, lo, A {}); } // load_complex_unaligned template XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert>, requires_arch) noexcept { using real_batch = batch; T_in const* buffer = reinterpret_cast(mem); real_batch hi = real_batch::load_unaligned(buffer), lo = real_batch::load_unaligned(buffer + real_batch::size); return detail::load_complex(hi, lo, A {}); } // store_complex_aligned template XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { using real_batch = batch; real_batch hi = detail::complex_high(src, A {}); real_batch lo = detail::complex_low(src, A {}); T_out* buffer = reinterpret_cast(dst); 
lo.store_aligned(buffer); hi.store_aligned(buffer + real_batch::size); } // store_complex_unaligned template XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept { using real_batch = batch; real_batch hi = detail::complex_high(src, A {}); real_batch lo = detail::complex_low(src, A {}); T_out* buffer = reinterpret_cast(dst); lo.store_unaligned(buffer); hi.store_unaligned(buffer + real_batch::size); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; alignas(A::alignment()) T scratch_buffer[batch::size * batch::size]; for (size_t i = 0; i < batch::size; ++i) { matrix_begin[i].store_aligned(&scratch_buffer[i * batch::size]); } // FIXME: this is super naive we can probably do better. for (size_t i = 0; i < batch::size; ++i) { for (size_t j = 0; j < i; ++j) { std::swap(scratch_buffer[i * batch::size + j], scratch_buffer[j * batch::size + i]); } } for (size_t i = 0; i < batch::size; ++i) { matrix_begin[i] = batch::load_aligned(&scratch_buffer[i * batch::size]); } } // transpose template ::size == 8, void>::type> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto l0 = zip_lo(matrix_begin[0], matrix_begin[1]); auto l1 = zip_lo(matrix_begin[2], matrix_begin[3]); auto l2 = zip_lo(matrix_begin[4], matrix_begin[5]); auto l3 = zip_lo(matrix_begin[6], matrix_begin[7]); auto l4 = zip_lo(bit_cast>(l0), bit_cast>(l1)); auto l5 = zip_lo(bit_cast>(l2), bit_cast>(l3)); auto l6 = zip_hi(bit_cast>(l0), bit_cast>(l1)); auto l7 = zip_hi(bit_cast>(l2), bit_cast>(l3)); auto h0 = zip_hi(matrix_begin[0], matrix_begin[1]); auto h1 = zip_hi(matrix_begin[2], matrix_begin[3]); auto h2 = zip_hi(matrix_begin[4], matrix_begin[5]); auto h3 = zip_hi(matrix_begin[6], matrix_begin[7]); auto h4 = zip_lo(bit_cast>(h0), bit_cast>(h1)); auto h5 = zip_lo(bit_cast>(h2), bit_cast>(h3)); auto h6 = zip_hi(bit_cast>(h0), bit_cast>(h1)); auto h7 = zip_hi(bit_cast>(h2), bit_cast>(h3)); matrix_begin[0] = bit_cast>(zip_lo(bit_cast>(l4), bit_cast>(l5))); matrix_begin[1] = bit_cast>(zip_hi(bit_cast>(l4), bit_cast>(l5))); matrix_begin[2] = bit_cast>(zip_lo(bit_cast>(l6), bit_cast>(l7))); matrix_begin[3] = bit_cast>(zip_hi(bit_cast>(l6), bit_cast>(l7))); matrix_begin[4] = bit_cast>(zip_lo(bit_cast>(h4), bit_cast>(h5))); matrix_begin[5] = bit_cast>(zip_hi(bit_cast>(h4), bit_cast>(h5))); matrix_begin[6] = bit_cast>(zip_lo(bit_cast>(h6), bit_cast>(h7))); matrix_begin[7] = bit_cast>(zip_hi(bit_cast>(h6), bit_cast>(h7))); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template ::size == 16, void>::type> XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto l0 = zip_lo(matrix_begin[0], matrix_begin[1]); auto l1 = zip_lo(matrix_begin[2], matrix_begin[3]); auto l2 = zip_lo(matrix_begin[4], matrix_begin[5]); auto l3 = zip_lo(matrix_begin[6], matrix_begin[7]); auto l4 = zip_lo(matrix_begin[8], matrix_begin[9]); auto l5 = zip_lo(matrix_begin[10], matrix_begin[11]); auto l6 = zip_lo(matrix_begin[12], matrix_begin[13]); 
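            // (Editorial note.) This is a butterfly transpose: the zip_lo/zip_hi
            // calls in this block interleave adjacent rows at scalar width, and the
            // later L*/H*/M* stages repeat the interleave at doubled element widths
            // (through bit_cast) until each output row holds one original column.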
auto l7 = zip_lo(matrix_begin[14], matrix_begin[15]); auto h0 = zip_hi(matrix_begin[0], matrix_begin[1]); auto h1 = zip_hi(matrix_begin[2], matrix_begin[3]); auto h2 = zip_hi(matrix_begin[4], matrix_begin[5]); auto h3 = zip_hi(matrix_begin[6], matrix_begin[7]); auto h4 = zip_hi(matrix_begin[8], matrix_begin[9]); auto h5 = zip_hi(matrix_begin[10], matrix_begin[11]); auto h6 = zip_hi(matrix_begin[12], matrix_begin[13]); auto h7 = zip_hi(matrix_begin[14], matrix_begin[15]); auto L0 = zip_lo(bit_cast>(l0), bit_cast>(l1)); auto L1 = zip_lo(bit_cast>(l2), bit_cast>(l3)); auto L2 = zip_lo(bit_cast>(l4), bit_cast>(l5)); auto L3 = zip_lo(bit_cast>(l6), bit_cast>(l7)); auto m0 = zip_lo(bit_cast>(L0), bit_cast>(L1)); auto m1 = zip_lo(bit_cast>(L2), bit_cast>(L3)); auto m2 = zip_hi(bit_cast>(L0), bit_cast>(L1)); auto m3 = zip_hi(bit_cast>(L2), bit_cast>(L3)); matrix_begin[0] = bit_cast>(zip_lo(bit_cast>(m0), bit_cast>(m1))); matrix_begin[1] = bit_cast>(zip_hi(bit_cast>(m0), bit_cast>(m1))); matrix_begin[2] = bit_cast>(zip_lo(bit_cast>(m2), bit_cast>(m3))); matrix_begin[3] = bit_cast>(zip_hi(bit_cast>(m2), bit_cast>(m3))); auto L4 = zip_hi(bit_cast>(l0), bit_cast>(l1)); auto L5 = zip_hi(bit_cast>(l2), bit_cast>(l3)); auto L6 = zip_hi(bit_cast>(l4), bit_cast>(l5)); auto L7 = zip_hi(bit_cast>(l6), bit_cast>(l7)); auto m4 = zip_lo(bit_cast>(L4), bit_cast>(L5)); auto m5 = zip_lo(bit_cast>(L6), bit_cast>(L7)); auto m6 = zip_hi(bit_cast>(L4), bit_cast>(L5)); auto m7 = zip_hi(bit_cast>(L6), bit_cast>(L7)); matrix_begin[4] = bit_cast>(zip_lo(bit_cast>(m4), bit_cast>(m5))); matrix_begin[5] = bit_cast>(zip_hi(bit_cast>(m4), bit_cast>(m5))); matrix_begin[6] = bit_cast>(zip_lo(bit_cast>(m6), bit_cast>(m7))); matrix_begin[7] = bit_cast>(zip_hi(bit_cast>(m6), bit_cast>(m7))); auto H0 = zip_lo(bit_cast>(h0), bit_cast>(h1)); auto H1 = zip_lo(bit_cast>(h2), bit_cast>(h3)); auto H2 = zip_lo(bit_cast>(h4), bit_cast>(h5)); auto H3 = zip_lo(bit_cast>(h6), bit_cast>(h7)); auto M0 = zip_lo(bit_cast>(H0), bit_cast>(H1)); auto M1 = zip_lo(bit_cast>(H2), bit_cast>(H3)); auto M2 = zip_hi(bit_cast>(H0), bit_cast>(H1)); auto M3 = zip_hi(bit_cast>(H2), bit_cast>(H3)); matrix_begin[8] = bit_cast>(zip_lo(bit_cast>(M0), bit_cast>(M1))); matrix_begin[9] = bit_cast>(zip_hi(bit_cast>(M0), bit_cast>(M1))); matrix_begin[10] = bit_cast>(zip_lo(bit_cast>(M2), bit_cast>(M3))); matrix_begin[11] = bit_cast>(zip_hi(bit_cast>(M2), bit_cast>(M3))); auto H4 = zip_hi(bit_cast>(h0), bit_cast>(h1)); auto H5 = zip_hi(bit_cast>(h2), bit_cast>(h3)); auto H6 = zip_hi(bit_cast>(h4), bit_cast>(h5)); auto H7 = zip_hi(bit_cast>(h6), bit_cast>(h7)); auto M4 = zip_lo(bit_cast>(H4), bit_cast>(H5)); auto M5 = zip_lo(bit_cast>(H6), bit_cast>(H7)); auto M6 = zip_hi(bit_cast>(H4), bit_cast>(H5)); auto M7 = zip_hi(bit_cast>(H6), bit_cast>(H7)); matrix_begin[12] = bit_cast>(zip_lo(bit_cast>(M4), bit_cast>(M5))); matrix_begin[13] = bit_cast>(zip_hi(bit_cast>(M4), bit_cast>(M5))); matrix_begin[14] = bit_cast>(zip_lo(bit_cast>(M6), bit_cast>(M7))); matrix_begin[15] = bit_cast>(zip_hi(bit_cast>(M6), bit_cast>(M7))); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_rounding.hpp000066400000000000000000000053351475736624100253040ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain 
Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_ROUNDING_HPP #define XSIMD_GENERIC_ROUNDING_HPP #include "./xsimd_generic_details.hpp" namespace xsimd { namespace kernel { using namespace types; // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { batch truncated_self = trunc(self); return select(truncated_self < self, truncated_self + 1, truncated_self); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { batch truncated_self = trunc(self); return select(truncated_self > self, truncated_self - 1, truncated_self); } // round template XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept { auto v = abs(self); auto c = ceil(v); auto cp = select(c - 0.5 > v, c - 1, c); return select(v > constants::maxflint>(), self, copysign(cp, self)); } // trunc template ::value, void>::type> XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return self; } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return select(abs(self) < constants::maxflint>(), to_float(to_int(self)), self); } } } #endif xsimd-13.2.0/include/xsimd/arch/generic/xsimd_generic_trigo.hpp000066400000000000000000001300111475736624100245710ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_TRIGO_HPP #define XSIMD_GENERIC_TRIGO_HPP #include "./xsimd_generic_details.hpp" #include namespace xsimd { namespace kernel { /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ using namespace types; // acos template XSIMD_INLINE batch acos(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto x_larger_05 = x > batch_type(0.5); x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self); x = asin(x); x = select(x_larger_05, x + x, x); x = select(self < batch_type(-0.5), constants::pi() - x, x); return select(x_larger_05, x, constants::pio2() - x); } template XSIMD_INLINE batch, A> acos(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; batch_type tmp = asin(z); return { constants::pio2() - tmp.real(), -tmp.imag() }; } // acosh /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch acosh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = self - batch_type(1.); auto test = x > constants::oneotwoeps(); batch_type z = select(test, self, x + sqrt(x + x + x * x)); batch_type l1pz = log1p(z); return select(test, l1pz + constants::log_2(), l1pz); } template XSIMD_INLINE batch, A> acosh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = acos(z); w = batch_type(-w.imag(), w.real()); return w; } // asin template XSIMD_INLINE batch asin(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type sign = bitofsign(self); auto x_larger_05 = x > batch_type(0.5); batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x); x = select(x_larger_05, sqrt(z), x); batch_type z1 = detail::horner(z); z1 = fma(z1, z * x, x); z = select(x_larger_05, constants::pio2() - (z1 + z1), z1); return z ^ sign; } template XSIMD_INLINE batch asin(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto small_cond = x < constants::sqrteps(); batch_type ct1 = batch_type(bit_cast(int64_t(0x3fe4000000000000))); batch_type zz1 = batch_type(1.) - x; batch_type vp = zz1 * detail::horner(zz1) / detail::horner1(zz1); zz1 = sqrt(zz1 + zz1); batch_type z = constants::pio4() - zz1; zz1 = fms(zz1, vp, constants::pio_2lo()); z = z - zz1; zz1 = z + constants::pio4(); batch_type zz2 = self * self; z = zz2 * detail::horner(zz2) / detail::horner1(zz2); zz2 = fma(x, z, x); return select(x > batch_type(1.), constants::nan(), select(small_cond, x, select(x > ct1, zz1, zz2)) ^ bitofsign(self)); } template XSIMD_INLINE batch, A> asin(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch x = z.real(); real_batch y = z.imag(); batch_type ct(-y, x); batch_type zz(real_batch(1.) 
- (x - y) * (x + y), -2 * x * y); zz = log(ct + sqrt(zz)); batch_type resg(zz.imag(), -zz.real()); return select(y == real_batch(0.), select(fabs(x) > real_batch(1.), batch_type(constants::pio2(), real_batch(0.)), batch_type(asin(x), real_batch(0.))), resg); } // asinh /* origin: boost/simd/arch/common/simd/function/asinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ namespace detail { template ::value, void>::type> XSIMD_INLINE batch average(const batch& x1, const batch& x2) noexcept { return (x1 & x2) + ((x1 ^ x2) >> 1); } template XSIMD_INLINE batch averagef(const batch& x1, const batch& x2) noexcept { using batch_type = batch; return fma(x1, batch_type(0.5), x2 * batch_type(0.5)); } template XSIMD_INLINE batch average(batch const& x1, batch const& x2) noexcept { return averagef(x1, x2); } template XSIMD_INLINE batch average(batch const& x1, batch const& x2) noexcept { return averagef(x1, x2); } } template XSIMD_INLINE batch asinh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto lthalf = x < batch_type(0.5); batch_type x2 = x * x; batch_type bts = bitofsign(self); batch_type z(0.); if (any(lthalf)) { z = detail::horner(x2) * x; if (all(lthalf)) return z ^ bts; } batch_type tmp = select(x > constants::oneosqrteps(), x, detail::average(x, hypot(batch_type(1.), x))); #ifndef XSIMD_NO_NANS return select(isnan(self), constants::nan(), select(lthalf, z, log(tmp) + constants::log_2()) ^ bts); #else return select(lthalf, z, log(tmp) + constants::log_2()) ^ bts; #endif } template XSIMD_INLINE batch asinh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test = x > constants::oneosqrteps(); batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) 
+ hypot(batch_type(1.), x))); #ifndef XSIMD_NO_INFINITIES z = select(x == constants::infinity(), x, z); #endif batch_type l1pz = log1p(z); z = select(test, l1pz + constants::log_2(), l1pz); return bitofsign(self) ^ z; } template XSIMD_INLINE batch, A> asinh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = asin(batch_type(-z.imag(), z.real())); w = batch_type(w.imag(), -w.real()); return w; } // atan namespace detail { template static XSIMD_INLINE batch kernel_atan(const batch& x, const batch& recx) noexcept { using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= batch_type(bit_cast((uint32_t)0x3ed413cd))) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); yy = select(flag2, constants::pio4(), yy); batch_type xx = select(flag1, x, -recx); xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); const batch_type z = xx * xx; batch_type z1 = detail::horner(z); z1 = fma(xx, z1 * z, xx); z1 = select(flag2, z1 + constants::pio_4lo(), z1); z1 = select(!flag1, z1 + constants::pio_2lo(), z1); return yy + z1; } template static XSIMD_INLINE batch kernel_atan(const batch& x, const batch& recx) noexcept { using batch_type = batch; const auto flag1 = x < constants::tan3pio8(); const auto flag2 = (x >= constants::tanpio8()) && flag1; batch_type yy = select(flag1, batch_type(0.), constants::pio2()); yy = select(flag2, constants::pio4(), yy); batch_type xx = select(flag1, x, -recx); xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx); batch_type z = xx * xx; z *= detail::horner(z) / detail::horner1(z); z = fma(xx, z, xx); z = select(flag2, z + constants::pio_4lo(), z); z = z + select(flag1, batch_type(0.), constants::pio_2lo()); return yy + z; } } template XSIMD_INLINE batch atan(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type absa = abs(self); const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa); return x ^ bitofsign(self); } template XSIMD_INLINE batch, A> atan(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch x = z.real(); real_batch y = z.imag(); real_batch x2 = x * x; real_batch one(1.); real_batch a = one - x2 - (y * y); real_batch w = 0.5 * atan2(2. * x, a); real_batch num = y + one; num = x2 + num * num; real_batch den = y - one; den = x2 + den * den; batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)), batch_type(real_batch(0.), constants::infinity()), batch_type(w, 0.25 * log(num / den))); return res; } // atanh /* origin: boost/simd/arch/common/simd/function/acosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch atanh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); batch_type t = x + x; batch_type z = batch_type(1.) 
- x; auto test = x < batch_type(0.5); batch_type tmp = select(test, x, t) / z; return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp))); } template XSIMD_INLINE batch, A> atanh(const batch, A>& z, requires_arch) noexcept { using batch_type = batch, A>; batch_type w = atan(batch_type(-z.imag(), z.real())); w = batch_type(w.imag(), -w.real()); return w; } // atan2 template XSIMD_INLINE batch atan2(batch const& self, batch const& other, requires_arch) noexcept { using batch_type = batch; const batch_type q = abs(self / other); const batch_type z = detail::kernel_atan(q, batch_type(1.) / q); return select(other > batch_type(0.), z, constants::pi() - z) * signnz(self); } // cos namespace detail { template XSIMD_INLINE batch quadrant(const batch& x) noexcept { return x & batch(3); } template XSIMD_INLINE batch quadrant(const batch& x) noexcept { return to_float(quadrant(to_int(x))); } template XSIMD_INLINE batch quadrant(const batch& x) noexcept { using batch_type = batch; batch_type a = x * batch_type(0.25); return (a - floor(a)) * batch_type(4.); } /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch cos_eval(const batch& z) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z); } template XSIMD_INLINE batch sin_eval(const batch& z, const batch& x) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return fma(y * z, x, x); } template static XSIMD_INLINE batch base_tancot_eval(const batch& z) noexcept { using batch_type = batch; batch_type zz = z * z; batch_type y = detail::horner(zz); return fma(y, zz * z, z); } template static XSIMD_INLINE batch tan_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) / y); } template static XSIMD_INLINE batch cot_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template static XSIMD_INLINE batch cos_eval(const batch& z) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return batch_type(1.) - y * z; } template static XSIMD_INLINE batch sin_eval(const batch& z, const batch& x) noexcept { using batch_type = batch; batch_type y = detail::horner(z); return fma(y * z, x, x); } template static XSIMD_INLINE batch base_tancot_eval(const batch& z) noexcept { using batch_type = batch; batch_type zz = z * z; batch_type num = detail::horner(zz); batch_type den = detail::horner1(zz); return fma(z, (zz * (num / den)), z); } template static XSIMD_INLINE batch tan_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, y, -batch_type(1.) 
/ y); } template static XSIMD_INLINE batch cot_eval(const batch& z, const BB& test) noexcept { using batch_type = batch; batch_type y = base_tancot_eval(z); return select(test, batch_type(1.) / y, -y); } /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ struct trigo_radian_tag { }; struct trigo_pi_tag { }; template struct trigo_reducer { static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept { if (all(x <= constants::pio4())) { xr = x; return B(0.); } else if (all(x <= constants::pio2())) { auto test = x > constants::pio4(); xr = x - constants::pio2_1(); xr -= constants::pio2_2(); xr -= constants::pio2_3(); xr = select(test, xr, x); return select(test, B(1.), B(0.)); } else if (all(x <= constants::twentypi())) { B xi = nearbyint(x * constants::twoopi()); xr = fnma(xi, constants::pio2_1(), x); xr -= xi * constants::pio2_2(); xr -= xi * constants::pio2_3(); return quadrant(xi); } else if (all(x <= constants::mediumpi())) { B fn = nearbyint(x * constants::twoopi()); B r = x - fn * constants::pio2_1(); B w = fn * constants::pio2_1t(); B t = r; w = fn * constants::pio2_2(); r = t - w; w = fn * constants::pio2_2t() - ((t - r) - w); t = r; w = fn * constants::pio2_3(); r = t - w; w = fn * constants::pio2_3t() - ((t - r) - w); xr = r - w; return quadrant(fn); } else { static constexpr std::size_t size = B::size; using value_type = typename B::value_type; alignas(B) std::array tmp; alignas(B) std::array txr; alignas(B) std::array args; x.store_aligned(args.data()); for (std::size_t i = 0; i < size; ++i) { double arg = args[i]; if (arg == std::numeric_limits::infinity()) { tmp[i] = 0.; txr[i] = std::numeric_limits::quiet_NaN(); } else { double y[2]; std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y); tmp[i] = value_type(n & 3); txr[i] = value_type(y[0]); } } xr = B::load_aligned(&txr[0]); B res = B::load_aligned(&tmp[0]); return res; } } }; template struct trigo_reducer { static XSIMD_INLINE B reduce(const B& x, B& xr) noexcept { B xi = nearbyint(x * B(2.)); B x2 = x - xi * B(0.5); xr = x2 * constants::pi(); return quadrant(xi); } }; } template XSIMD_INLINE batch cos(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit != batch_type(0.), se, ce); return z1 ^ sign_bit; } template XSIMD_INLINE batch, A> cos(batch, A> const& z, requires_arch) noexcept { return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) }; } // cosh /* origin: boost/simd/arch/common/simd/function/cosh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch cosh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type x = abs(self); auto test1 = x > (constants::maxlog() - constants::log_2()); batch_type fac = select(test1, batch_type(0.5), batch_type(1.)); batch_type tmp = exp(x * fac); batch_type tmp1 = batch_type(0.5) * tmp; return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp)); } template XSIMD_INLINE batch, A> cosh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); return { cosh(x) * cos(y), sinh(x) * sin(y) }; } // sin namespace detail { template XSIMD_INLINE batch sin(batch const& self, Tag = Tag()) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); const batch_type z1 = select(swap_bit == batch_type(0.), se, ce); return z1 ^ sign_bit; } } template XSIMD_INLINE batch sin(batch const& self, requires_arch) noexcept { return detail::sin(self); } template XSIMD_INLINE batch, A> sin(batch, A> const& z, requires_arch) noexcept { return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) }; } // sincos template XSIMD_INLINE std::pair, batch> sincos(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); const batch_type z = xr * xr; const batch_type se = detail::sin_eval(z, xr); const batch_type ce = detail::cos_eval(z); auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce); auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask(), batch_type(0.)); const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce); return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit); } template XSIMD_INLINE std::pair, A>, batch, A>> sincos(batch, A> const& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch rcos = cos(z.real()); real_batch rsin = sin(z.real()); real_batch icosh = cosh(z.imag()); real_batch isinh = sinh(z.imag()); return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh)); } // sinh namespace detail { /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. 
* (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch sinh_kernel(batch const& self) noexcept { using batch_type = batch; batch_type sqr_self = self * self; return detail::horner(sqr_self) * self; } template XSIMD_INLINE batch sinh_kernel(batch const& self) noexcept { using batch_type = batch; batch_type sqrself = self * self; return fma(self, (detail::horner(sqrself) / detail::horner1(sqrself)) * sqrself, self); } } /* origin: boost/simd/arch/common/simd/function/sinh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch sinh(batch const& a, requires_arch) noexcept { using batch_type = batch; batch_type half(0.5); batch_type x = abs(a); auto lt1 = x < batch_type(1.); batch_type bts = bitofsign(a); batch_type z(0.); if (any(lt1)) { z = detail::sinh_kernel(x); if (all(lt1)) return z ^ bts; } auto test1 = x > (constants::maxlog() - constants::log_2()); batch_type fac = select(test1, half, batch_type(1.)); batch_type tmp = exp(x * fac); batch_type tmp1 = half * tmp; batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp); return select(lt1, z, r) ^ bts; } template XSIMD_INLINE batch, A> sinh(const batch, A>& z, requires_arch) noexcept { auto x = z.real(); auto y = z.imag(); return { sinh(x) * cos(y), cosh(x) * sin(y) }; } // tan template XSIMD_INLINE batch tan(batch const& self, requires_arch) noexcept { using batch_type = batch; const batch_type x = abs(self); batch_type xr = constants::nan(); const batch_type n = detail::trigo_reducer::reduce(x, xr); auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.)); auto swap_bit = fma(batch_type(-2.), tmp, n); auto test = (swap_bit == batch_type(0.)); const batch_type y = detail::tan_eval(xr, test); return y ^ bitofsign(self); } template XSIMD_INLINE batch, A> tan(batch, A> const& z, requires_arch) noexcept { using batch_type = batch, A>; using real_batch = typename batch_type::real_batch; real_batch d = cos(2 * z.real()) + cosh(2 * z.imag()); batch_type winf(constants::infinity(), constants::infinity()); real_batch wreal = sin(2 * z.real()) / d; real_batch wimag = sinh(2 * z.imag()); batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d)); return select(d == real_batch(0.), winf, wres); } // tanh namespace detail { /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template struct tanh_kernel; template struct tanh_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept { batch_type sqrx = x * x; return fma(detail::horner(sqrx) * sqrx, x, x); } static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept { return batch_type(1.) 
/ tanh(x); } }; template struct tanh_kernel> { using batch_type = batch; static XSIMD_INLINE batch_type tanh(const batch_type& x) noexcept { batch_type sqrx = x * x; return fma(sqrx * p(sqrx) / q(sqrx), x, x); } static XSIMD_INLINE batch_type cotanh(const batch_type& x) noexcept { batch_type sqrx = x * x; batch_type qval = q(sqrx); return qval / (x * fma(p(sqrx), sqrx, qval)); } static XSIMD_INLINE batch_type p(const batch_type& x) noexcept { return detail::horner(x); } static XSIMD_INLINE batch_type q(const batch_type& x) noexcept { return detail::horner1(x); } }; } /* origin: boost/simd/arch/common/simd/function/tanh.hpp */ /* * ==================================================== * copyright 2016 NumScale SAS * * Distributed under the Boost Software License, Version 1.0. * (See copy at http://boost.org/LICENSE_1_0.txt) * ==================================================== */ template XSIMD_INLINE batch tanh(batch const& self, requires_arch) noexcept { using batch_type = batch; batch_type one(1.); batch_type x = abs(self); auto test = x < (batch_type(5.) / batch_type(8.)); batch_type bts = bitofsign(self); batch_type z = one; if (any(test)) { z = detail::tanh_kernel::tanh(x); if (all(test)) return z ^ bts; } batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one); return select(test, z, r) ^ bts; } template XSIMD_INLINE batch, A> tanh(const batch, A>& z, requires_arch) noexcept { using real_batch = typename batch, A>::real_batch; auto x = z.real(); auto y = z.imag(); real_batch two(2); auto d = cosh(two * x) + cos(two * y); return { sinh(two * x) / d, sin(two * y) / d }; } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx.hpp000066400000000000000000002563141475736624100211520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX_HPP #define XSIMD_AVX_HPP #include #include #include #include "../types/xsimd_avx_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fwd template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; namespace detail { XSIMD_INLINE void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept { low = _mm256_castsi256_si128(val); high = _mm256_extractf128_si256(val, 1); } XSIMD_INLINE void split_avx(__m256 val, __m128& low, __m128& high) noexcept { low = _mm256_castps256_ps128(val); high = _mm256_extractf128_ps(val, 1); } XSIMD_INLINE void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept { low = _mm256_castpd256_pd128(val); high = _mm256_extractf128_pd(val, 1); } XSIMD_INLINE __m256i merge_sse(__m128i low, __m128i high) noexcept { return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1); } XSIMD_INLINE __m256 merge_sse(__m128 low, __m128 high) noexcept { return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1); } XSIMD_INLINE __m256d merge_sse(__m128d low, __m128d high) noexcept { return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1); } template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self) noexcept { __m128i self_low, self_high; split_avx(self, self_low, self_high); __m128i res_low = f(self_low); __m128i res_high = f(self_high); return merge_sse(res_low, res_high); } template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept { __m128i self_low, self_high, other_low, other_high; split_avx(self, self_low, self_high); split_avx(other, other_low, other_high); __m128i res_low = f(self_low, other_low); __m128i res_high = f(self_high, other_high); return merge_sse(res_low, res_high); } template XSIMD_INLINE __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept { __m128i self_low, self_high; split_avx(self, self_low, self_high); __m128i res_low = f(self_low, other); __m128i res_high = f(self_high, other); return merge_sse(res_low, res_high); } } // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31 return _mm256_andnot_ps(sign_mask, self); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m256d sign_mask = _mm256_set1_pd(-0.f); // -0.f = 1 << 31 return _mm256_andnot_pd(sign_mask, self); } // add template ::value, void>::type> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return add(batch(s), batch(o)); }, self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_add_ps(self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_add_pd(self, other); } // all template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_ps(self, batch_bool(true)) != 0; } template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm256_testc_pd(self, batch_bool(true)) != 0; } template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, 
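// --- Illustrative sketch (not part of xsimd) of the pattern behind detail::split_avx /
// merge_sse / fwd_to_sse above: AVX1 provides no 256-bit integer arithmetic, so the integer
// kernels split each register into two SSE halves, run the 128-bit operation on both, and
// merge the halves back into a 256-bit register.
#include <immintrin.h>

inline __m256i add_epi32_via_sse_sketch(__m256i a, __m256i b)
{
    __m128i a_lo = _mm256_castsi256_si128(a);      // low 128 bits, no instruction emitted
    __m128i a_hi = _mm256_extractf128_si256(a, 1); // high 128 bits
    __m128i b_lo = _mm256_castsi256_si128(b);
    __m128i b_hi = _mm256_extractf128_si256(b, 1);
    __m128i r_lo = _mm_add_epi32(a_lo, b_lo);      // plain SSE2 op on each half
    __m128i r_hi = _mm_add_epi32(a_hi, b_hi);
    return _mm256_insertf128_si256(_mm256_castsi128_si256(r_lo), r_hi, 1);
}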
requires_arch) noexcept { return _mm256_testc_si256(self, batch_bool(true)) != 0; } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_ps(self, self); } template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_pd(self, self); } template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return !_mm256_testz_si256(self, self); } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_ps(self, other); } template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o)); }, self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o)); }, self, other); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_ps(other, self); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_pd(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_ps(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_pd(other, self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_andnot(batch(s), batch(o)); }, self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_andnot(batch(s), batch(o)); }, self, other); } // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept { return bitwise_lshift(batch(s), o, sse4_2 {}); }, self, other); } // bitwise_not template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s) noexcept { return bitwise_not(batch(s), sse4_2 {}); }, self); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) 
noexcept { return detail::fwd_to_sse([](__m128i s) noexcept { return bitwise_not(batch_bool(s), sse4_2 {}); }, self); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_ps(self, other); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_or(batch(s), batch(o)); }, self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_or(batch_bool(s), batch_bool(o)); }, self, other); } // bitwise_rshift template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, sse4_2 {}); }, self, other); } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_xor(batch(s), batch(o), sse4_2 {}); }, self, other); } template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_xor(batch_bool(s), batch_bool(o), sse4_2 {}); }, self, other); } // bitwise_cast template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castsi256_ps(self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castsi256_pd(self); } template ::type>::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return batch(self.data); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castps_pd(self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castps_si256(self); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return 
_mm256_castpd_ps(self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_castpd_si256(self); } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1))); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1))); } // broadcast template ::value, void>::type> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_set1_epi64x(val); } else { assert(false && "unsupported"); return {}; } } template XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept { return _mm256_set1_ps(val); } template XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return _mm256_set1_pd(val); } // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm256_ceil_ps(self); } template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm256_ceil_pd(self); } namespace detail { // On clang, _mm256_extractf128_ps is built upon build_shufflevector // which require index parameter to be a constant template XSIMD_INLINE B get_half_complex_f(const B& real, const B& imag) noexcept { __m128 tmp0 = _mm256_extractf128_ps(real, index); __m128 tmp1 = _mm256_extractf128_ps(imag, index); __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1); tmp0 = _mm_unpacklo_ps(tmp0, tmp1); __m256 res = real; res = _mm256_insertf128_ps(res, tmp0, 0); res = _mm256_insertf128_ps(res, tmp2, 1); return res; } template XSIMD_INLINE B get_half_complex_d(const B& real, const B& imag) noexcept { __m128d tmp0 = _mm256_extractf128_pd(real, index); __m128d tmp1 = _mm256_extractf128_pd(imag, index); __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1); tmp0 = _mm_unpacklo_pd(tmp0, tmp1); __m256d res = real; res = _mm256_insertf128_pd(res, tmp0, 0); res = _mm256_insertf128_pd(res, tmp2, 1); return res; } // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return get_half_complex_f<0>(self.real(), self.imag()); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return get_half_complex_d<0>(self.real(), self.imag()); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return get_half_complex_f<1>(self.real(), self.imag()); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return get_half_complex_d<1>(self.real(), self.imag()); } } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm256_cvtepi32_ps(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return 
_mm256_cvttps_epi32(self); } } // decr_if template ::value, void>::type> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch(mask.data); } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_div_ps(self, other); } template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_div_pd(self, other); } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_EQ_OQ); } template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_EQ_OQ); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return eq(batch(s), batch(o), sse4_2 {}); }, self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return ~(self != other); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm256_floor_ps(self); } template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm256_floor_pd(self); } // from_mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut32[] = { 0x0000000000000000ul, 0x00000000FFFFFFFFul, 0xFFFFFFFF00000000ul, 0xFFFFFFFFFFFFFFFFul, }; assert(!(mask & ~0xFFul) && "inbound mask"); return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6])); } template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 
0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, }; assert(!(mask & ~0xFul) && "inbound mask"); return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask])); } template ::value, void>::type> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut32[] = { 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF, }; alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000ul, 0x000000000000FFFFul, 0x00000000FFFF0000ul, 0x00000000FFFFFFFFul, 0x0000FFFF00000000ul, 0x0000FFFF0000FFFFul, 0x0000FFFFFFFF0000ul, 0x0000FFFFFFFFFFFFul, 0xFFFF000000000000ul, 0xFFFF00000000FFFFul, 0xFFFF0000FFFF0000ul, 0xFFFF0000FFFFFFFFul, 0xFFFFFFFF00000000ul, 0xFFFFFFFF0000FFFFul, 0xFFFFFFFFFFFF0000ul, 0xFFFFFFFFFFFFFFFFul, }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFFFFFFul) && "inbound mask"); return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF], lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF], lut32[(mask >> 24) & 0xF], lut32[mask >> 28]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFFFFul) && "inbound mask"); return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_castps_si256(from_mask(batch_bool {}, mask, avx {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_castpd_si256(from_mask(batch_bool {}, mask, avx {})); } } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // row = (a,b,c,d,e,f,g,h) // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7) __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]); // tmp1 = (c0+c1, c2+c3, d1+d2, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7) __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]); // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7) tmp1 = _mm256_hadd_ps(tmp0, tmp1); // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7) tmp0 = _mm256_hadd_ps(row[4], row[5]); // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7) __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]); // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp2 = _mm256_hadd_ps(tmp0, tmp2); // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3, // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7) tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000); // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7, // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3) tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21); return _mm256_add_ps(tmp0, tmp1); } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // row = (a,b,c,d) // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3) __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]); // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3) __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]); // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3) __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100); // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3) tmp1 = 
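// --- Scalar model (illustrative, not part of xsimd) of what from_mask computes above:
// bit i of the input mask decides whether lane i of the result is all-ones or all-zeros.
// The kernels avoid the per-lane loop by expanding several mask bits at a time through the
// small lut32 / lut64 lookup tables and assembling the register with a single set.
#include <cstdint>

inline void from_mask_model(uint64_t mask, uint32_t out[8])
{
    for (int lane = 0; lane < 8; ++lane)
        out[lane] = ((mask >> lane) & 1u) ? 0xFFFFFFFFu : 0u;
}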
_mm256_permute2f128_pd(tmp0, tmp1, 0x21); return _mm256_add_pd(tmp1, tmp2); } // incr_if template ::value, void>::type> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch(mask.data); } // insert template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { #if !defined(_MSC_VER) || _MSC_VER > 1900 XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_insert_epi8(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_insert_epi16(self, val, I); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_insert_epi32(self, val, I); } else { return insert(self, val, pos, generic {}); } #endif return insert(self, val, pos, generic {}); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm256_cmp_ps(self, self, _CMP_UNORD_Q); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm256_cmp_pd(self, self, _CMP_UNORD_Q); } // le template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_LE_OQ); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_LE_OQ); } // load_aligned template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return _mm256_load_si256((__m256i const*)mem); } template XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm256_load_ps(mem); } template XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm256_load_pd(mem); } namespace detail { // load_complex template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; __m128 tmp0 = _mm256_extractf128_ps(hi, 0); __m128 tmp1 = _mm256_extractf128_ps(hi, 1); __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); batch_type real = _mm256_castps128_ps256(tmp_real); batch_type imag = _mm256_castps128_ps256(tmp_imag); tmp0 = _mm256_extractf128_ps(lo, 0); tmp1 = _mm256_extractf128_ps(lo, 1); tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0)); tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1)); real = _mm256_insertf128_ps(real, tmp_real, 1); imag = _mm256_insertf128_ps(imag, tmp_imag, 1); return { real, imag }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; __m128d tmp0 = _mm256_extractf128_pd(hi, 0); __m128d tmp1 = _mm256_extractf128_pd(hi, 1); batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1)); batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1)); tmp0 = _mm256_extractf128_pd(lo, 0); tmp1 = _mm256_extractf128_pd(lo, 1); __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1); __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1); real = _mm256_blend_pd(real, re_tmp1, 12); imag = _mm256_blend_pd(imag, im_tmp1, 12); return { real, imag }; } } // load_unaligned template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm256_loadu_si256((__m256i const*)mem); } template XSIMD_INLINE batch load_unaligned(float const* 
mem, convert, requires_arch) noexcept { return _mm256_loadu_ps(mem); } template XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm256_loadu_pd(mem); } // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_LT_OQ); } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_LT_OQ); } template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return lt(batch(s), batch(o)); }, self, other); } // mask template ::value, void>::type> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { __m128i self_low, self_high; detail::split_avx(self, self_low, self_high); return mask(batch_bool(self_low), sse4_2 {}) | (mask(batch_bool(self_high), sse4_2 {}) << (128 / (8 * sizeof(T)))); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_movemask_ps(_mm256_castsi256_ps(self)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_movemask_pd(_mm256_castsi256_pd(self)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm256_movemask_ps(self); } template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm256_movemask_pd(self); } // max template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_max_ps(self, other); } template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_max_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_min_ps(self, other); } template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_min_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_mul_ps(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_mul_pd(self, other); } // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm256_cvtps_epi32(self); } // neg template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } template batch neg(batch const& self, requires_arch) { return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000))); } template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { 
return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000))); } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ); } template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_ps(self, other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data))); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm256_rcp_ps(self); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept { // Warning about _mm256_hadd_ps: // _mm256_hadd_ps(a,b) gives // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't // rely on a naive use of this method // rhs = (x0, x1, x2, x3, x4, x5, x6, x7) // tmp = (x4, x5, x6, x7, x0, x1, x2, x3) __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1); // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7) tmp = _mm256_add_ps(rhs, tmp); // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -) tmp = _mm256_hadd_ps(tmp, tmp); // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -) tmp = _mm256_hadd_ps(tmp, tmp); return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0)); } template XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept { // rhs = (x0, x1, x2, x3) // tmp = (x2, x3, x0, x1) __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1); // tmp = (x2+x0, x3+x1, -, -) tmp = _mm256_add_pd(rhs, tmp); // tmp = (x2+x0+x3+x1, -, -, -) tmp = _mm256_hadd_pd(tmp, tmp); return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0)); } template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { __m128i low, high; detail::split_avx(self, low, high); batch blow(low), bhigh(high); return reduce_add(blow) + reduce_add(bhigh); } // reduce_max template ::type> XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr auto mask = detail::shuffle(1, 0); batch step = _mm256_permute2f128_si256(self, self, mask); batch acc = max(self, step); __m128i low = _mm256_castsi256_si128(acc); return reduce_max(batch(low)); } // reduce_min template ::type> XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr auto mask = detail::shuffle(1, 0); batch step = _mm256_permute2f128_si256(self, self, mask); batch acc = min(self, step); __m128i low = _mm256_castsi256_si128(acc); return reduce_min(batch(low)); } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm256_rsqrt_ps(val); } template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val))); } // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) 
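// --- Usage sketch, assuming the public xsimd::reduce_add entry point that dispatches to the
// horizontal-sum kernels above on AVX targets: accumulate a float array eight lanes at a
// time, then fold the register once at the end.
#include <xsimd/xsimd.hpp>
#include <cstddef>

inline float sum_sketch(const float* data, std::size_t n)
{
    using batch = xsimd::batch<float, xsimd::avx>;
    batch acc(0.f);
    std::size_t i = 0;
    for (; i + batch::size <= n; i += batch::size)
        acc += batch::load_unaligned(data + i);
    float s = xsimd::reduce_add(acc); // single horizontal sum of the 8 lanes
    for (; i < n; ++i)                // scalar tail
        s += data[i];
    return s;
}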
noexcept { if (std::is_signed::value) { auto mask = (other >> (8 * sizeof(T) - 1)); auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(batch_bool(mask.data), self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm256_blendv_ps(false_br, true_br, cond); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm256_blendv_pd(false_br, true_br, cond); } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { __m128i cond_low, cond_hi; detail::split_avx(cond, cond_low, cond_hi); __m128i true_low, true_hi; detail::split_avx(true_br, true_low, true_hi); __m128i false_low, false_hi; detail::split_avx(false_br, false_low, false_hi); __m128i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), sse4_2 {}); __m128i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), sse4_2 {}); return detail::merge_sse(res_low, res_hi); } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, avx2 {}); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr auto mask = batch_bool_constant::mask(); return _mm256_blend_ps(false_br, true_br, mask); } template XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr auto mask = batch_bool_constant::mask(); return _mm256_blend_pd(false_br, true_br, mask); } // set template XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm256_setr_ps(values...); } template XSIMD_INLINE batch set(batch const&, requires_arch, Values... 
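// --- Scalar sketch (not part of xsimd) mirroring the signed branch of the saturating add
// above: instead of widening, clamp one operand against the headroom left by the other
// (max - other on the positive side, min - other on the negative side) and only then add,
// so the sum can never wrap around.
#include <cstdint>
#include <limits>

inline int32_t sadd_sketch(int32_t a, int32_t b)
{
    if (b >= 0)
        return a > std::numeric_limits<int32_t>::max() - b
            ? std::numeric_limits<int32_t>::max()
            : a + b;
    return a < std::numeric_limits<int32_t>::min() - b
        ? std::numeric_limits<int32_t>::min()
        : a + b;
}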
values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm256_setr_pd(values...); } template ::value, void>::type> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return _mm256_set_epi64x(v3, v2, v1, v0); } template ::value, void>::type> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template ::value, void>::type> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); } template ::value, void>::type> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm256_castsi256_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm256_castsi256_pd(set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12) return _mm256_shuffle_ps(x, y, smask); // shuffle within opposite lane if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12) return _mm256_shuffle_ps(y, x, smask); return shuffle(x, y, mask, generic {}); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3); // shuffle within lane if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6) return _mm256_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6) return _mm256_shuffle_pd(y, x, smask); return shuffle(x, y, mask, generic {}); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; __m128i low = _mm256_castsi256_si128(x); auto y = _mm_slli_si128(low, M); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, y, 1); } if (BitCount == 128) { __m128i low = _mm256_castsi256_si128(x); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, low, 1); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; __m128i low = _mm256_castsi256_si128(x); auto ylow = _mm_slli_si128(low, M); auto zlow = _mm_srli_si128(low, 16 - M); __m128i high = _mm256_extractf128_si256(x, 1); auto yhigh = _mm_slli_si128(high, M); __m256i res = _mm256_castsi128_si256(ylow); return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; __m128i high = _mm256_extractf128_si256(x, 1); __m128i y = _mm_srli_si128(high, M); __m256i zero = _mm256_setzero_si256(); return _mm256_insertf128_si256(zero, y, 0); } if (BitCount == 128) { __m128i high = _mm256_extractf128_si256(x, 1); return _mm256_castsi128_si256(high); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; __m128i low = _mm256_castsi256_si128(x); auto ylow = _mm_srli_si128(low, M); __m128i high = _mm256_extractf128_si256(x, 1); auto yhigh = _mm_srli_si128(high, M); auto zhigh = _mm_slli_si128(high, 16 - M); __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh)); return _mm256_insertf128_si256(res, yhigh, 1); } // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm256_sqrt_ps(val); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm256_sqrt_pd(val); } // ssub template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return 
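// --- Semantic model (illustrative, not part of xsimd) of slide_left<N> as implemented
// above: move every byte of the register N positions towards the most significant end,
// filling with zeros.  AVX1 has no 256-bit byte shift, so the kernel stitches the result
// together from 128-bit _mm_slli_si128 / _mm_srli_si128 plus the bytes carried across the
// lane boundary.
#include <cstdint>
#include <cstring>

template <unsigned N>
inline void slide_left_model(const uint8_t (&in)[32], uint8_t (&out)[32])
{
    std::memset(out, 0, 32);
    if (N < 32)
        std::memcpy(out + N, in, 32 - N); // byte i of the input becomes byte i + N
}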
self - diff; } } // store_aligned template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm256_store_si256((__m256i*)mem, self); } template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm256_store_si256((__m256i*)mem, self); } template XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm256_store_ps(mem, self); } template XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm256_store_pd(mem, self); } // store_unaligned template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_si256((__m256i*)mem, self); } template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm256_storeu_si256((__m256i*)mem, self); } template XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_ps(mem, self); } template XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm256_storeu_pd(mem, self); } // sub template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_sub_ps(self, other); } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_sub_pd(self, other); } // swizzle (dynamic mask) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // duplicate low and high part of input __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); // normalize mask batch half_mask = mask % 4; // permute within each lane __m256 r0 = _mm256_permutevar_ps(low_low, half_mask); __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask); // mask to choose the right lane batch_bool blend_mask = mask >= 4; // blend the two permutes return _mm256_blendv_ps(r0, r1, batch_bool_cast(blend_mask)); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { // duplicate low and high part of input __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); // normalize mask batch half_mask = -(mask & 1); // permute within each lane __m256d r0 = _mm256_permutevar_pd(low_low, half_mask); __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask); // mask to choose the right lane batch_bool blend_mask = mask >= 2; // blend the two permutes return _mm256_blendv_pd(r0, r1, batch_bool_cast(blend_mask)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } template = 0> XSIMD_INLINE batch swizzle(batch const& 
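// --- Scalar model (illustrative, not part of xsimd) of the swizzle semantics implemented
// above: output lane i takes the input lane named by mask[i].  AVX1's vpermilps/vpermilpd
// only permute inside each 128-bit lane, which is why the kernel duplicates the low and
// high halves and blends between the two partial permutations.
#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N>
std::array<float, N> swizzle_model(const std::array<float, N>& in,
                                   const std::array<uint32_t, N>& mask)
{
    std::array<float, N> out {};
    for (std::size_t i = 0; i < N; ++i)
        out[i] = in[mask[i] % N]; // indices taken modulo the lane count
    return out;
}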
self, batch const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } // swizzle (constant mask) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { // duplicate low and high part of input __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0); __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self)); __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); // normalize mask batch_constant half_mask; // permute within each lane __m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch()); __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch()); // mask to choose the right lane batch_bool_constant= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask; // blend the two permutes constexpr auto mask = blend_mask.mask(); return _mm256_blend_ps(r0, r1, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { // duplicate low and high part of input __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0); __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self)); __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); // normalize mask batch_constant half_mask; // permute within each lane __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch()); __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch()); // mask to choose the right lane batch_bool_constant= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask; // blend the two permutes constexpr auto mask = blend_mask.mask(); return _mm256_blend_pd(r0, r1, mask); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } template = 0> XSIMD_INLINE batch swizzle(batch const& self, batch_constant const& mask, requires_arch) noexcept { return bitwise_cast( swizzle(bitwise_cast(self), mask)); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; // See // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2 auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3], r4 = matrix_begin[4], r5 = matrix_begin[5], r6 = matrix_begin[6], r7 = matrix_begin[7]; auto t0 = _mm256_unpacklo_ps(r0, r1); auto t1 = _mm256_unpackhi_ps(r0, r1); auto t2 = _mm256_unpacklo_ps(r2, r3); auto t3 = _mm256_unpackhi_ps(r2, r3); auto t4 = _mm256_unpacklo_ps(r4, r5); auto t5 = _mm256_unpackhi_ps(r4, r5); auto t6 = _mm256_unpacklo_ps(r6, r7); auto t7 = _mm256_unpackhi_ps(r6, r7); r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0)); r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2)); r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0)); r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2)); r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0)); r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2)); r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0)); r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2)); matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20); matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 
0x20); matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20); matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20); matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31); matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31); matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31); matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11 auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13 auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31 auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33 matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20); matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20); matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31); matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[8]; for (int i = 0; i < 8; ++i) tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 8, sse4_2 {}); batch tmp_hi0[8]; for (int i = 0; i < 8; ++i) tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[8 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 8, sse4_2 {}); batch tmp_lo1[8]; for (int i = 0; i < 8; ++i) tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 8, sse4_2 {}); batch tmp_hi1[8]; for (int i = 0; i < 8; ++i) tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[8 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 8, sse4_2 {}); for (int i = 0; i < 8; ++i) matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 8; ++i) matrix_begin[i + 8] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[16]; for (int i = 0; i < 16; ++i) tmp_lo0[i] = _mm256_castsi256_si128(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 16, sse4_2 {}); batch tmp_hi0[16]; for (int i = 0; i < 16; ++i) tmp_hi0[i] = _mm256_castsi256_si128(matrix_begin[16 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 
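// --- Semantic model (illustrative, not part of xsimd) of the register transposes above:
// the N batches form an N x N element matrix and rows are exchanged with columns; the
// intrinsics version reaches the same result with unpack / shuffle steps inside each
// 128-bit lane followed by a permute2f128 that swaps the lane halves.
#include <cstddef>

inline void transpose8x8_model(float m[8][8])
{
    for (std::size_t i = 0; i < 8; ++i)
        for (std::size_t j = i + 1; j < 8; ++j)
        {
            const float t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}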
16, sse4_2 {}); batch tmp_lo1[16]; for (int i = 0; i < 16; ++i) tmp_lo1[i] = _mm256_extractf128_si256(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 16, sse4_2 {}); batch tmp_hi1[16]; for (int i = 0; i < 16; ++i) tmp_hi1[i] = _mm256_extractf128_si256(matrix_begin[16 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 16, sse4_2 {}); for (int i = 0; i < 16; ++i) matrix_begin[i] = detail::merge_sse(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 16; ++i) matrix_begin[i + 16] = detail::merge_sse(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // trunc template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm256_round_ps(self, _MM_FROUND_TO_ZERO); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm256_round_pd(self, _MM_FROUND_TO_ZERO); } // zip_hi template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { // extract high word __m128i self_hi = _mm256_extractf128_si256(self, 1); __m128i other_hi = _mm256_extractf128_si256(other, 1); // interleave __m128i res_lo, res_hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { res_lo = _mm_unpacklo_epi8(self_hi, other_hi); res_hi = _mm_unpackhi_epi8(self_hi, other_hi); } else { res_lo = _mm_unpacklo_epi16(self_hi, other_hi); res_hi = _mm_unpackhi_epi16(self_hi, other_hi); } // fuse return _mm256_castps_si256( _mm256_insertf128_ps( _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), _mm_castsi128_ps(res_hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_ps(self, other); auto hi = _mm256_unpackhi_ps(self, other); return _mm256_permute2f128_ps(lo, hi, 0x31); } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_pd(self, other); auto hi = _mm256_unpackhi_pd(self, other); return _mm256_permute2f128_pd(lo, hi, 0x31); } // zip_lo template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2) { // extract low word __m128i self_lo = _mm256_extractf128_si256(self, 0); __m128i other_lo = _mm256_extractf128_si256(other, 0); // interleave __m128i res_lo, res_hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { res_lo = _mm_unpacklo_epi8(self_lo, other_lo); res_hi = _mm_unpackhi_epi8(self_lo, other_lo); } else { res_lo = _mm_unpacklo_epi16(self_lo, other_lo); res_hi = _mm_unpackhi_epi16(self_lo, other_lo); } // fuse return _mm256_castps_si256( _mm256_insertf128_ps( _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)), _mm_castsi128_ps(res_hi), 1)); 
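// --- Scalar model (illustrative, not part of xsimd) of zip_lo / zip_hi as implemented
// above: interleave the low (respectively high) halves of two registers.  Because the AVX
// unpack instructions interleave per 128-bit lane, the kernels add an insertf128 /
// permute2f128 step to pick the true low or high half of the full 256-bit register.
#include <array>
#include <cstddef>

template <class T, std::size_t N>
std::array<T, N> zip_lo_model(const std::array<T, N>& a, const std::array<T, N>& b)
{
    std::array<T, N> r {};
    for (std::size_t i = 0; i < N / 2; ++i)
    {
        r[2 * i] = a[i];     // element i of the low half of a
        r[2 * i + 1] = b[i]; // interleaved with element i of the low half of b
    }
    return r;
}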
} else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other)); return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other)); return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1)); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_ps(self, other); auto hi = _mm256_unpackhi_ps(self, other); return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1); } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm256_unpacklo_pd(self, other); auto hi = _mm256_unpackhi_pd(self, other); return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx2.hpp000066400000000000000000001244511475736624100212300ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX2_HPP #define XSIMD_AVX2_HPP #include #include #include "../types/xsimd_avx2_register.hpp" namespace xsimd { namespace kernel { using namespace types; // abs template ::value, void>::type> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_abs_epi16(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_abs_epi32(self); } else { return abs(self, avx {}); } } return self; } // add template ::value, void>::type> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_add_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_add_epi64(self, other); } else { return add(self, other, avx {}); } } // avgr template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_avg_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_avg_epu16(self, other); } else { return avgr(self, other, generic {}); } } // avg template ::value, void>::type> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto adj = ((self ^ other) << 7) >> 7; return avgr(self, other, A {}) - adj; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto adj = ((self ^ other) << 15) >> 15; return avgr(self, other, A {}) - adj; } else { return avg(self, 
other, generic {}); } } // bitwise_and template ::value, void>::type> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_and_si256(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_and_si256(self, other); } // bitwise_andnot template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_andnot_si256(other, self); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_andnot_si256(other, self); } // bitwise_not template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm256_xor_si256(self, _mm256_set1_epi32(-1)); } // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_slli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_slli_epi64(self, other); } else { return bitwise_lshift(self, other, avx {}); } } template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_sllv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_sllv_epi64(self, other); } else { return bitwise_lshift(self, other, avx {}); } } // bitwise_or template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_or_si256(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_or_si256(self, other); } // bitwise_rshift template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF); __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self); __m256i res = _mm256_srai_epi16(self, other); return _mm256_or_si256( detail::fwd_to_sse([](__m128i s, __m128i o) noexcept { return bitwise_and(batch(s), batch(o), sse4_2 {}); }, sign_mask, cmp_is_negative), _mm256_andnot_si256(sign_mask, res)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srai_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srai_epi32(self, other); } else { return bitwise_rshift(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_srli_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srli_epi64(self, other); } else { return bitwise_rshift(self, other, avx {}); } } } template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { 
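// Illustrative note (not part of the original source): AVX2 only provides
// per-element variable shift counts for 32-bit lanes (_mm256_sllv_epi32,
// _mm256_srlv_epi32, _mm256_srav_epi32) and for 64-bit logical shifts
// (_mm256_sllv_epi64, _mm256_srlv_epi64); a 64-bit arithmetic variable shift
// only appears with AVX-512. Hence only those element widths get a dedicated
// intrinsic below, and every other case falls back to the avx implementation.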
XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srav_epi32(self, other); } else { return bitwise_rshift(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_srlv_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_srlv_epi64(self, other); } else { return bitwise_rshift(self, other, avx {}); } } } // bitwise_xor template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm256_xor_si256(self, other); } template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm256_xor_si256(self, other); } // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2)); __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0)); return _mm256_blend_pd(tmp0, tmp1, 10); } // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to avx __m256i xH = _mm256_srli_epi64(x, 32); xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84 __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx // adapted to avx __m256i xH = _mm256_srai_epi32(x, 16); xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67 __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52 __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 return _mm256_add_pd(f, _mm256_castsi256_pd(xL)); } } // eq template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_cmpeq_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_cmpeq_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return 
_mm256_cmpeq_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_cmpeq_epi64(self, other); } else { return eq(self, other, avx {}); } } // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_epi32(reinterpret_cast(src), index, sizeof(T)); } template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i64gather_epi64(reinterpret_cast(src), index, sizeof(T)); } template = 0> XSIMD_INLINE batch gather(batch const&, float const* src, batch const& index, kernel::requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i32gather_ps(src, index, sizeof(float)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { // scatter for this one is AVX512F+AVX512VL return _mm256_i64gather_pd(src, index, sizeof(double)); } // gather: handmade conversions template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double))); const batch high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double))); return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data)); } // lt template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_cmpgt_epi8(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_cmpgt_epi16(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_cmpgt_epi32(other, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_cmpgt_epi64(other, self); } else { return lt(self, other, avx {}); } } else { return lt(self, other, avx {}); } } // load_complex template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; batch_type real = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))), _MM_SHUFFLE(3, 1, 2, 0))); batch_type imag = _mm256_castpd_ps( _mm256_permute4x64_pd( _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))), _MM_SHUFFLE(3, 1, 2, 0))); return { real, imag }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { using batch_type = batch; batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0)); return { real, imag }; } // mask template ::value, void>::type> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) 
== 1) { return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self); return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12); } else { return mask(self, avx {}); } } // max template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_max_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_max_epi32(self, other); } else { return max(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_max_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_max_epu32(self, other); } else { return max(self, other, avx {}); } } } // min template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_min_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_min_epi32(self, other); } else { return min(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_min_epu16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_min_epu32(self, other); } else { return min(self, other, avx {}); } } } // mul template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00); __m256i res_lo = _mm256_mullo_epi16(self, other); __m256i other_hi = _mm256_srli_epi16(other, 8); __m256i self_hi = _mm256_and_si256(self, mask_hi); __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi); __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi); return res; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_mullo_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_mullo_epi32(self, other); } else { return mul(self, other, avx {}); } } // reduce_add template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { __m256i tmp1 = _mm256_hadd_epi32(self, self); __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3); return _mm_cvtsi128_si32(tmp4); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E); __m256i tmp2 = _mm256_add_epi64(self, tmp1); __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1); __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3); #if defined(__x86_64__) return _mm_cvtsi128_si64(res); #else __m128i m; _mm_storel_epi64(&m, res); int64_t i; std::memcpy(&i, &m, sizeof(i)); return i; #endif } else { return reduce_add(self, avx {}); } } // rotate_left template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return _mm256_alignr_epi8(self, self, N); } template XSIMD_INLINE 
batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); } // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_adds_epi16(self, other); } else { return sadd(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_adds_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_adds_epu16(self, other); } else { return sadd(self, other, avx {}); } } } // select template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_blendv_epi8(false_br, true_br, cond); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_blendv_epi8(false_br, true_br, cond); } else { return select(cond, true_br, false_br, avx {}); } } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { constexpr int mask = batch_bool_constant::mask(); // FIXME: for some reason mask here is not considered as an immediate, // but it's okay for _mm256_blend_epi32 // case 2: return _mm256_blend_epi16(false_br, true_br, mask); XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_blend_epi32(false_br, true_br, mask); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { constexpr int imask = detail::interleave(mask); return _mm256_blend_epi32(false_br, true_br, imask); } else { return select(batch_bool { Values... 
}, true_br, false_br, avx2 {}); } } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; auto y = _mm256_bslli_epi128(x, M); return _mm256_permute2x128_si256(y, y, 0x28); } if (BitCount == 128) { return _mm256_permute2x128_si256(x, x, 0x28); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; auto y = _mm256_bslli_epi128(x, M); auto z = _mm256_bsrli_epi128(x, 16 - M); auto w = _mm256_permute2x128_si256(z, z, 0x28); return _mm256_or_si256(y, w); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 256) { return batch(T(0)); } if (BitCount > 128) { constexpr unsigned M = (BitCount - 128) / 8; auto y = _mm256_bsrli_epi128(x, M); return _mm256_permute2x128_si256(y, y, 0x81); } if (BitCount == 128) { return _mm256_permute2x128_si256(x, x, 0x81); } // shifting by [0, 128[ bits constexpr unsigned M = BitCount / 8; auto y = _mm256_bsrli_epi128(x, M); auto z = _mm256_bslli_epi128(x, 16 - M); auto w = _mm256_permute2x128_si256(z, z, 0x81); return _mm256_or_si256(y, w); } // ssub template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_subs_epi16(self, other); } else { return ssub(self, other, avx {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_subs_epu16(self, other); } else { return ssub(self, other, avx {}); } } } // sub template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm256_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm256_sub_epi16(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm256_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm256_sub_epi64(self, other); } else { return sub(self, other, avx {}); } } // swizzle (dynamic mask) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm256_permutevar8x32_ps(self, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { batch broadcaster = { 0, 1, 0, 1, 0, 1, 0, 1 }; constexpr uint64_t comb = 0x0000000100000001ul * 2; return bitwise_cast(swizzle(bitwise_cast(self), bitwise_cast(mask * comb) + broadcaster, avx2 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm256_permutevar8x32_epi32(self, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } // swizzle (constant mask) template XSIMD_INLINE batch 
swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return _mm256_permutevar8x32_ps(self, mask.as_batch()); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr auto mask = detail::shuffle(V0, V1, V2, V3); return _mm256_permute4x64_pd(self, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr auto mask = detail::shuffle(V0, V1, V2, V3); return _mm256_permute4x64_epi64(self, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return _mm256_permutevar8x32_epi32(self, mask.as_batch()); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx2 {})); } // zip_hi template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto lo = _mm256_unpacklo_epi8(self, other); auto hi = _mm256_unpackhi_epi8(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto lo = _mm256_unpacklo_epi16(self, other); auto hi = _mm256_unpackhi_epi16(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_epi32(self, other); auto hi = _mm256_unpackhi_epi32(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_epi64(self, other); auto hi = _mm256_unpackhi_epi64(self, other); return _mm256_permute2f128_si256(lo, hi, 0x31); } else { assert(false && "unsupported arch/op combination"); return {}; } } // zip_lo template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto lo = _mm256_unpacklo_epi8(self, other); auto hi = _mm256_unpackhi_epi8(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto lo = _mm256_unpacklo_epi16(self, other); auto hi = _mm256_unpackhi_epi16(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { auto lo = _mm256_unpacklo_epi32(self, other); auto hi = _mm256_unpackhi_epi32(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { auto lo = _mm256_unpacklo_epi64(self, other); auto hi = _mm256_unpackhi_epi64(self, other); return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1); } else { assert(false && "unsupported arch/op combination"); return {}; } } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512bw.hpp000066400000000000000000000655641475736624100217400ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
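// Illustrative sketch (not part of the original source): both the AVX2 kernel
// above and the AVX512BW kernel below implement the truncating average avg()
// on top of the rounding average avgr(). The hardware intrinsics
// (_mm256_avg_epu8 / _mm512_avg_epu8 and friends) compute (a + b + 1) >> 1,
// which rounds up; the truncating average (a + b) >> 1 is exactly one smaller
// whenever a + b is odd, i.e. whenever the low bit of a ^ b is set, and that
// is the correction the kernels subtract. A scalar reference with hypothetical
// helper names:
inline unsigned scalar_avgr_u8(unsigned a, unsigned b)
{
    return (a + b + 1) >> 1; // rounding average, as computed by _mm*_avg_epu8
}
inline unsigned scalar_avg_u8(unsigned a, unsigned b)
{
    unsigned adj = (a ^ b) & 1u; // 1 exactly when a + b is odd
    return scalar_avgr_u8(a, b) - adj; // truncating average
}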
* ****************************************************************************/ #ifndef XSIMD_AVX512BW_HPP #define XSIMD_AVX512BW_HPP #include #include #include "../types/xsimd_avx512bw_register.hpp" namespace xsimd { namespace kernel { using namespace types; namespace detail { template XSIMD_INLINE batch_bool compare_int_avx512bw(batch const& self, batch const& other) noexcept { using register_type = typename batch_bool::register_type; if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); } } } } // abs template ::value, void>::type> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) { return self; } XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_abs_epi8(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_abs_epi16(self); } else { return abs(self, avx512dq {}); } } // add template ::value, void>::type> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_add_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_add_epi16(self, other); } else { return add(self, other, avx512dq {}); } } // avgr template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_avg_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_avg_epu16(self, other); } else { return avgr(self, other, generic {}); } } // avg template ::value, void>::type> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { auto adj = ((self ^ other) << 7) >> 7; return avgr(self, other, A {}) - adj; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { auto adj = ((self ^ other) << 15) >> 15; return avgr(self, other, A {}) - adj; } else { return avg(self, other, generic {}); } } // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_sllv_epi16(self, _mm512_set1_epi16(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_slli_epi16(self, other); #endif } else { return bitwise_lshift(self, other, avx512dq {}); } } // bitwise_rshift template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF); __m512i zeros = _mm512_setzero_si512(); __mmask64 cmp_is_negative_mask = 
_mm512_cmpgt_epi8_mask(zeros, self); __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else __m512i res = _mm512_srai_epi16(self, other); #endif return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res)); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srav_epi16(self, _mm512_set1_epi16(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srai_epi16(self, other); #endif } else { return bitwise_rshift(self, other, avx512dq {}); } } else { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srlv_epi16(self, _mm512_set1_epi16(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_srli_epi16(self, other); #endif } else { return bitwise_rshift(self, other, avx512dq {}); } } } // eq template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // ge template ::value, void>::type> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // gt template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // le template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // lt template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // max template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_max_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_max_epi16(self, other); } else { return max(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_max_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_max_epu16(self, other); } else { return max(self, other, avx512dq {}); } } } // min template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_min_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_min_epi16(self, other); } else { return min(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_min_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_min_epu16(self, other); } else { return min(self, other, avx512dq {}); } } } // mul template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8)); __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8); return _mm512_or_si512(upper, lower); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return 
_mm512_mullo_epi16(self, other); } else { return mul(self, other, avx512dq {}); } } // neq template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512bw(self, other); } // rotate_left template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return _mm512_alignr_epi8(self, self, N); } template XSIMD_INLINE batch rotate_left(batch const& self, requires_arch) noexcept { return bitwise_cast(rotate_left(bitwise_cast(self), avx2 {})); } // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_adds_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_adds_epi16(self, other); } else { return sadd(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_adds_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_adds_epu16(self, other); } else { return sadd(self, other, avx512dq {}); } } } // select template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data); } else { return select(cond, true_br, false_br, avx512dq {}); } } // slide_left namespace detail { template constexpr std::array make_slide_perm_hi(::xsimd::detail::index_sequence) { return { (Is == 0 ? 8 : Is - 1)... }; } template constexpr std::array make_slide_left_pattern(::xsimd::detail::index_sequence) { return { (Is >= N ? Is - N : 0)... }; } template constexpr std::array make_slide_left_mask(::xsimd::detail::index_sequence) { return { (Is >= N ? 0xFFFF : 0x0000)... }; } } template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 512) { return batch(T(0)); } batch xx; if (N & 1) { alignas(A::alignment()) uint64_t buffer[8]; _mm512_store_epi64(&buffer[0], x); for (int i = 7; i > 0; --i) buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56); buffer[0] = buffer[0] << 8; xx = _mm512_load_epi64(&buffer[0]); alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>()); __m512i xl = _mm512_slli_epi64(x, 8); __m512i xr = _mm512_srli_epi64(x, 56); xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); xx = _mm512_or_si512(xr, xl); if (N == 1) return xx; } else { xx = x; } alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask(::xsimd::detail::make_index_sequence<512 / 16>()); return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); } // slide_right namespace detail { template constexpr std::array make_slide_perm_low(::xsimd::detail::index_sequence) { return { (Is + 1)... }; } template constexpr std::array make_slide_right_pattern(::xsimd::detail::index_sequence) { return { (Is < (32 - N) ? Is + N : 0)... 
}; } template constexpr std::array make_slide_right_mask(::xsimd::detail::index_sequence) { return { (Is < 32 - N ? 0xFFFF : 0x0000)... }; } } template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { constexpr unsigned BitCount = N * 8; if (BitCount == 0) { return x; } if (BitCount >= 512) { return batch(T(0)); } batch xx; if (N & 1) { alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>()); __m512i xr = _mm512_srli_epi64(x, 8); __m512i xl = _mm512_slli_epi64(x, 56); xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512()); xx = _mm512_or_si512(xr, xl); if (N == 1) return xx; } else { xx = x; } alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern(::xsimd::detail::make_index_sequence<512 / 16>()); alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask(::xsimd::detail::make_index_sequence<512 / 16>()); return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data())); } // ssub template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_subs_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_subs_epi16(self, other); } else { return ssub(self, other, avx512dq {}); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_subs_epu8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_subs_epu16(self, other); } else { return ssub(self, other, avx512dq {}); } } } // sub template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_sub_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_sub_epi16(self, other); } else { return sub(self, other, avx512dq {}); } } // swizzle (dynamic version) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi16(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_shuffle_epi8(self, mask); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512bw {})); } // swizzle (static version) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512bw {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512bw {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512bw {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512bw {}); } // zip_hi template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { lo = _mm512_unpacklo_epi8(self, other); hi = 
_mm512_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { lo = _mm512_unpacklo_epi16(self, other); hi = _mm512_unpackhi_epi16(self, other); } else { return zip_hi(self, other, avx512f {}); } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), _mm512_extracti32x4_epi32(lo, 3), 2), _mm512_extracti32x4_epi32(hi, 2), 1); } // zip_lo template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { lo = _mm512_unpacklo_epi8(self, other); hi = _mm512_unpackhi_epi8(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { lo = _mm512_unpacklo_epi16(self, other); hi = _mm512_unpackhi_epi16(self, other); } else { return zip_lo(self, other, avx512f {}); } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), _mm512_extracti32x4_epi32(hi, 1), 3), _mm512_extracti32x4_epi32(lo, 1), 2); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512cd.hpp000066400000000000000000000017231475736624100217010ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512CD_HPP #define XSIMD_AVX512CD_HPP #include "../types/xsimd_avx512cd_register.hpp" namespace xsimd { namespace kernel { // Nothing there yet. } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512dq.hpp000066400000000000000000000222261475736624100217200ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
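// Illustrative sketch (not part of the original source): the haddp() kernel in
// this file takes a pointer to 16 float batches (one per output lane) and
// returns a batch whose i-th lane is the horizontal sum of row[i]. A scalar
// reference of that contract, with a hypothetical name:
inline void scalar_haddp_reference(float const rows[16][16], float out[16])
{
    for (int i = 0; i < 16; ++i)
    {
        float acc = 0.0f;
        for (int j = 0; j < 16; ++j)
        {
            acc += rows[i][j]; // out[i] accumulates every element of row i
        }
        out[i] = acc;
    }
}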
* ****************************************************************************/ #ifndef XSIMD_AVX512_DQHPP #define XSIMD_AVX512_D_HPP #include "../types/xsimd_avx512dq_register.hpp" namespace xsimd { namespace kernel { using namespace types; // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_ps(self, other); } template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_pd(self, other); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_ps(other, self); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_pd(other, self); } // bitwise_not template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1))); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1))); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_ps(self, other); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_pd(self, other); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_ps(self, other); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_pd(self, other); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_ps(tmp1, tmp2); \ } XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 // The following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... 
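// Illustrative note (not part of the original source): each STEP2 expansion
// below folds four of the eight partial sums produced by STEP1 into a single
// __m256 holding eight per-row totals; the final _mm256_hadd_ps performs the
// last pairwise reduction once the lanes have been shuffled into matching
// positions. The two resulting halves are then concatenated into the 16-lane
// result right after the macro is expanded.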
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx1 = _mm512_add_ps(tmp1, tmp2); \ \ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx2 = _mm512_add_ps(tmp3, tmp4); \ \ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \ _mm512_extractf32x8_ps(resx3, 1)); \ } XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_insertf32x8(concat, halfx1, 1); return concat; } // ldexp template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other)); } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mullo_epi64(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mullo_epi64(self, other); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm512_cvtpd_epi64(self); } // reduce_add template XSIMD_INLINE float reduce_add(batch const& rhs, requires_arch) noexcept { __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1); __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0); __m256 res1 = _mm256_add_ps(tmp1, tmp2); return reduce_add(batch(res1), avx2 {}); } // convert namespace detail { template XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { return _mm512_cvtepi64_pd(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvttpd_epi64(self); } } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512er.hpp000066400000000000000000000016451475736624100217240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512ER_HPP #define XSIMD_AVX512ER_HPP #include #include #include "../types/xsimd_avx512er_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512f.hpp000066400000000000000000003256441475736624100215530ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
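// Illustrative sketch (not part of the original source): the detail::morton()
// helper defined in this file interleaves the bits of two 16-bit masks, x into
// the even bit positions and y into the odd ones. The 16-bit integer
// comparisons in compare_int_avx512f are emulated with two 32-bit comparisons
// (one on the low halves, one on the high halves of each 32-bit lane), and
// morton() merges the two resulting 16-bit masks back into one 32-bit mask in
// lane order. A bit-by-bit scalar equivalent, with a hypothetical name:
inline uint32_t morton_interleave_reference(uint16_t x, uint16_t y)
{
    uint32_t z = 0;
    for (int i = 0; i < 16; ++i)
    {
        z |= (uint32_t(x >> i) & 1u) << (2 * i);     // bits of x land on even positions
        z |= (uint32_t(y >> i) & 1u) << (2 * i + 1); // bits of y land on odd positions
    }
    return z;
}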
* ****************************************************************************/ #ifndef XSIMD_AVX512F_HPP #define XSIMD_AVX512F_HPP #include #include #include #include "../types/xsimd_avx512f_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fwd template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept; namespace detail { XSIMD_INLINE void split_avx512(__m512 val, __m256& low, __m256& high) noexcept { low = _mm512_castps512_ps256(val); high = _mm512_extractf32x8_ps(val, 1); } XSIMD_INLINE void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept { low = _mm512_castpd512_pd256(val); high = _mm512_extractf64x4_pd(val, 1); } XSIMD_INLINE void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept { low = _mm512_castsi512_si256(val); high = _mm512_extracti64x4_epi64(val, 1); } XSIMD_INLINE __m512i merge_avx(__m256i low, __m256i high) noexcept { return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1); } XSIMD_INLINE __m512 merge_avx(__m256 low, __m256 high) noexcept { return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1)); } XSIMD_INLINE __m512d merge_avx(__m256d low, __m256d high) noexcept { return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1); } template __m512i fwd_to_avx(F f, __m512i self) { __m256i self_low, self_high; split_avx512(self, self_low, self_high); __m256i res_low = f(self_low); __m256i res_high = f(self_high); return merge_avx(res_low, res_high); } template __m512i fwd_to_avx(F f, __m512i self, __m512i other) { __m256i self_low, self_high, other_low, other_high; split_avx512(self, self_low, self_high); split_avx512(other, other_low, other_high); __m256i res_low = f(self_low, other_low); __m256i res_high = f(self_high, other_high); return merge_avx(res_low, res_high); } template __m512i fwd_to_avx(F f, __m512i self, int32_t other) { __m256i self_low, self_high; split_avx512(self, self_low, self_high); __m256i res_low = f(self_low, other); __m256i res_high = f(self_high, other); return merge_avx(res_low, res_high); } } namespace detail { XSIMD_INLINE uint32_t morton(uint16_t x, uint16_t y) noexcept { static const unsigned short MortonTable256[256] = { 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015, 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055, 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115, 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155, 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415, 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455, 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515, 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555, 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015, 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055, 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115, 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155, 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415, 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455, 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515, 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555, 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015, 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055, 0x4100, 0x4101, 0x4104, 
0x4105, 0x4110, 0x4111, 0x4114, 0x4115, 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155, 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415, 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455, 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515, 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555, 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015, 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055, 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115, 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155, 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415, 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455, 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515, 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555 }; uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF]; return z; } template XSIMD_INLINE batch_bool compare_int_avx512f(batch const& self, batch const& other) noexcept { using register_type = typename batch_bool::register_type; if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { // shifting to take sign into account uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x000000FF)) << 24, (batch(other.data) & batch(0x000000FF)) << 24, Cmp); uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FF00)) << 16, (batch(other.data) & batch(0x0000FF00)) << 16, Cmp); uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x00FF0000)) << 8, (batch(other.data) & batch(0x00FF0000)) << 8, Cmp); uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); uint64_t mask = 0; for (unsigned i = 0; i < 16; ++i) { mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2); mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); } return (register_type)mask; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { // shifting to take sign into account uint16_t mask_low = _mm512_cmp_epi32_mask((batch(self.data) & batch(0x0000FFFF)) << 16, (batch(other.data) & batch(0x0000FFFF)) << 16, Cmp); uint16_t mask_high = _mm512_cmp_epi32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); return static_cast(morton(mask_low, mask_high)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x000000FF)), (batch(other.data) & batch(0x000000FF)), Cmp); uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FF00)), (batch(other.data) & batch(0x0000FF00)), Cmp); uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x00FF0000)), (batch(other.data) & batch(0x00FF0000)), Cmp); uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFF000000)), (batch(other.data) & batch(0xFF000000)), Cmp); uint64_t mask = 0; for (unsigned i = 0; i < 16; ++i) { mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0); mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1); mask |= (mask_high0 & (uint64_t(1) << i)) << 
(3 * i + 2); mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3); } return (register_type)mask; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { uint16_t mask_low = _mm512_cmp_epu32_mask((batch(self.data) & batch(0x0000FFFF)), (batch(other.data) & batch(0x0000FFFF)), Cmp); uint16_t mask_high = _mm512_cmp_epu32_mask((batch(self.data) & batch(0xFFFF0000)), (batch(other.data) & batch(0xFFFF0000)), Cmp); return static_cast(morton(mask_low, mask_high)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp); } } } } // abs template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m512 self_asf = (__m512)self; __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf); __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi); return *reinterpret_cast<__m512*>(&res_asi); } template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { __m512d self_asd = (__m512d)self; __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd); __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), self_asi); return *reinterpret_cast<__m512d*>(&res_asi); } template ::value, void>::type> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) { return self; } XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s) noexcept { return abs(batch(s)); }, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s) noexcept { return abs(batch(s)); }, self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_abs_epi32(self); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_abs_epi64(self); } else { assert(false && "unsupported arch/op combination"); return {}; } } // add template ::value, void>::type> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return add(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return add(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_add_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_add_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_add_ps(self, other); } template XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_add_pd(self, other); } // all template XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return self.data == register_type(-1); } // any template XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return self.data != register_type(0); } // batch_bool_cast template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return self.data; } // bitwise_and template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { #if defined(_MSC_VER) return _mm512_and_ps(self, other); #else return 
_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); #endif } template XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template ::value, void>::type> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_and_si512(self, other); } template XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data & other.data); } // bitwise_andnot template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self))); } template XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self))); } template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_andnot_si512(other, self); } template XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data & ~other.data); } // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); #else __m512i tmp = _mm512_slli_epi32(self, other); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_lshift(batch(s), o, avx2 {}); }, self, other); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_sllv_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_sllv_epi64(self, _mm512_set1_epi64(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_slli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_slli_epi64(self, other); #endif } else { assert(false && "unsupported arch/op combination"); return {}; } } // bitwise_not template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_xor_si512(self, _mm512_set1_epi32(-1)); } template XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(~self.data); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1))); } template XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1))); } // bitwise_or template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return 
_mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); } template XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_or_si512(self, other); } // bitwise_rshift template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srav_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srav_epi64(self, _mm512_set1_epi64(other)); #else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srai_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srai_epi64(self, other); #endif } else { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, avx2 {}); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); #else __m512i tmp = _mm512_srli_epi32(self, other); #endif return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp); #if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY) } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srlv_epi32(self, _mm512_set1_epi32(other)); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srlv_epi64(self, _mm512_set1_epi64(other)); #else } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_srli_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_srli_epi64(self, other); #endif } else { return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept { return bitwise_rshift(batch(s), o, avx2 {}); }, self, other); } } } // bitwise_xor template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other))); } template XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other))); } template XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data | other.data); } template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_xor_si512(self, other); } // bitwise_cast template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castsi512_ps(self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castsi512_pd(self); } template ::type>::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) 
noexcept { return batch(self.data); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castps_pd(self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castps_si512(self); } template XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castpd_ps(self); } template ::value, void>::type> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_castpd_si512(self); } // broadcast template ::value, void>::type> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return _mm512_set1_epi8(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm512_set1_epi16(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_set1_epi32(val); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_set1_epi64(val); } else { assert(false && "unsupported"); return {}; } } template XSIMD_INLINE batch broadcast(float val, requires_arch) noexcept { return _mm512_set1_ps(val); } template batch XSIMD_INLINE broadcast(double val, requires_arch) noexcept { return _mm512_set1_pd(val); } // ceil template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF); } template XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept { return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); } // compress template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_ps(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_pd(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi32(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi32(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi64(mask.mask(), self); } template XSIMD_INLINE batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_compress_epi64(mask.mask(), self); } // convert namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepi32_ps(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvttps_epi32(self); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm512_cvtepu32_ps(self); } template batch fast_cast(batch const& self, batch const&, requires_arch) { return _mm512_cvttps_epu32(self); } } namespace detail { // complex_low template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); return _mm512_permutex2var_ps(self.real(), idx, self.imag()); } template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11); return 
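// complex_low / complex_high rebuild the interleaved (re, im, re, im, ...)
// layout from the separately stored real and imaginary vectors:
// _mm512_permutex2var_ps/_pd pick lanes alternately from both inputs, with
// indices past the register width addressing the imaginary operand.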
_mm512_permutex2var_pd(self.real(), idx, self.imag()); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); return _mm512_permutex2var_ps(self.real(), idx, self.imag()); } template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15); return _mm512_permutex2var_pd(self.real(), idx, self.imag()); } } // div template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_div_ps(self, other); } template XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_div_pd(self, other); } // eq template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ); } template XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } template XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(~self.data ^ other.data); } // expand template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_ps(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_pd(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi32(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi32(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi64(mask.mask(), self); } template XSIMD_INLINE batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept { return _mm512_maskz_expand_epi64(mask.mask(), self); } // floor template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF); } template XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept { return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF); } // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fnmadd_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fnmadd_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmadd_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmadd_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm512_fmsub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch 
const& y, batch const& z, requires_arch) noexcept { return _mm512_fmsub_pd(x, y, z); } // from bool template XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch) noexcept { return select(self, batch(1), batch(0)); } // from_mask template XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { return static_cast::register_type>(mask); } // gather template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i32gather_epi32(index, static_cast(src), sizeof(T)); } template = 0, detail::enable_sized_integral_t = 0> XSIMD_INLINE batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i64gather_epi64(index, static_cast(src), sizeof(T)); } template = 0> XSIMD_INLINE batch gather(batch const&, float const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i32gather_ps(index, src, sizeof(float)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, kernel::requires_arch) noexcept { return _mm512_i64gather_pd(index, src, sizeof(double)); } // gather: handmade conversions template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data)); } template = 0> XSIMD_INLINE batch gather(batch const&, double const* src, batch const& index, requires_arch) noexcept { const batch low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double))); const batch high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double))); return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data)); } // ge template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ); } template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ); } template ::value, void>::type> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // gt template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ); } template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ); } template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { // The following folds over the vector once: // tmp1 = [a0..8, b0..8] // tmp2 = [a8..f, b8..f] #define XSIMD_AVX512_HADDP_STEP1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_ps(tmp1, tmp2); \ } 
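// haddp produces a vector whose i-th lane is the horizontal sum of row[i].
// STEP1 folds two rows at a time by adding their lower and upper 256-bit
// halves; STEP2 continues folding with 128-bit lane shuffles and finishes
// each row's sum with _mm256_hadd_ps before packing the sixteen results
// back into one 512-bit register.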
XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]); XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]); XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]); XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]); XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]); XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]); XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]); XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]); #undef XSIMD_AVX512_HADDP_STEP1 // The following flds the code and shuffles so that hadd_ps produces the correct result // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3) // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4) // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ... #define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \ batch halfx##I; \ { \ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx1 = _mm512_add_ps(tmp1, tmp2); \ \ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx2 = _mm512_add_ps(tmp3, tmp4); \ \ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \ \ auto resx3 = _mm512_add_ps(tmp5, tmp6); \ \ halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \ } XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3); XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7); #undef XSIMD_AVX512_HADDP_STEP2 auto concat = _mm512_castps256_ps512(halfx0); concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1)); return concat; } template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { #define step1(I, a, b) \ batch res##I; \ { \ auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \ auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \ res##I = _mm512_add_pd(tmp1, tmp2); \ } step1(1, row[0], row[2]); step1(2, row[4], row[6]); step1(3, row[1], row[3]); step1(4, row[5], row[7]); #undef step1 auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0)); auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1)); auto resx1 = _mm512_add_pd(tmp5, tmp6); auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0)); auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1)); auto resx2 = _mm512_add_pd(tmp7, tmp8); auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000); auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111); return _mm512_add_pd(tmpx, tmpy); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q); } template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q); } // ldexp template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other)); } template XSIMD_INLINE batch ldexp(const batch& self, const batch, A>& other, requires_arch) noexcept { // FIXME: potential data loss here when converting other elements to // int32 before converting them back to double. 
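// In practice the exponent argument of ldexp on doubles only matters within
// a range of a few thousand either way (values beyond that already overflow
// or underflow), so the int64 -> int32 narrowing below is lossless for every
// meaningful input; only exponents outside the int32 range are truncated.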
__m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other)); return _mm512_scalef_pd(self, adjusted_index); } // le template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ); } template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // load_aligned template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return _mm512_load_si512((__m512i const*)mem); } template XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept { return _mm512_load_ps(mem); } template XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept { return _mm512_load_pd(mem); } // load_complex namespace detail { template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); auto real = _mm512_permutex2var_ps(hi, real_idx, lo); auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo); return { real, imag }; } template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14); __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15); auto real = _mm512_permutex2var_pd(hi, real_idx, lo); auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo); return { real, imag }; } } // load_unaligned template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { return _mm512_loadu_si512((__m512i const*)mem); } template XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept { return _mm512_loadu_ps(mem); } template XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept { return _mm512_loadu_pd(mem); } // lt template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ); } template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ); } template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return detail::compare_int_avx512f(self, other); } // mask template XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return self.data; } // max template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_max_ps(self, other); } template XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_max_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_max_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_max_epi64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i 
o) noexcept { return max(batch(s), batch(o)); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_max_epu32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_max_epu64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return max(batch(s), batch(o)); }, self, other); } } } // min template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_min_ps(self, other); } template XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_min_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_min_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_min_epi64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return min(batch(s), batch(o)); }, self, other); } } else { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_min_epu32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_min_epu64(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return min(batch(s), batch(o)); }, self, other); } } } // mul template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mul_ps(self, other); } template XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_mul_pd(self, other); } template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mullo_epi32(self, other); } else { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return mul(batch(s), batch(o)); }, self, other); } } // nearbyint template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); } template XSIMD_INLINE batch nearbyint(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION); } // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, requires_arch) noexcept { return _mm512_cvtps_epi32(self); } // neg template XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } // neq template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ); } template XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { using register_type = typename batch_bool::register_type; return register_type(self.data ^ other.data); } // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm512_rcp14_ps(self); } template XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) noexcept { return _mm512_rcp14_pd(self); } // reduce_add template XSIMD_INLINE 
float reduce_add(batch const& rhs, requires_arch) noexcept { __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0); __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1); __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2); __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3); __m128 res1 = _mm_add_ps(tmp1, tmp2); __m128 res2 = _mm_add_ps(tmp3, tmp4); __m128 res3 = _mm_add_ps(res1, res2); return reduce_add(batch(res3), sse4_2 {}); } template XSIMD_INLINE double reduce_add(batch const& rhs, requires_arch) noexcept { __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1); __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0); __m256d res1 = _mm256_add_pd(tmp1, tmp2); return reduce_add(batch(res1), avx2 {}); } template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { __m256i low, high; detail::split_avx512(self, low, high); batch blow(low), bhigh(high); return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {}); } // reduce_max template ::type> XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr batch_constant mask; batch step = _mm512_permutexvar_epi64(mask.as_batch(), self); batch acc = max(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_max(batch(low)); } // reduce_min template ::type> XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr batch_constant mask; batch step = _mm512_permutexvar_epi64(mask.as_batch(), self); batch acc = min(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_min(batch(low)); } // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm512_rsqrt14_ps(val); } template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm512_rsqrt14_pd(val); } // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { auto mask = other < 0; auto self_pos_branch = min(std::numeric_limits::max() - other, self); auto self_neg_branch = max(std::numeric_limits::min() - other, self); return other + select(mask, self_neg_branch, self_pos_branch); } else { const auto diffmax = std::numeric_limits::max() - self; const auto mindiff = min(diffmax, other); return self + mindiff; } } // scatter template ::value || std::is_same::value, void>::type> XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i32scatter_epi32(dst, index, src, sizeof(T)); } template ::value || std::is_same::value, void>::type> XSIMD_INLINE void scatter(batch const& src, T* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i64scatter_epi64(dst, index, src, sizeof(T)); } template XSIMD_INLINE void scatter(batch const& src, float* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i32scatter_ps(dst, index, src, sizeof(float)); } template XSIMD_INLINE void scatter(batch const& src, double* dst, batch const& index, kernel::requires_arch) noexcept { _mm512_i64scatter_pd(dst, index, src, sizeof(double)); } // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm512_mask_blend_ps(cond, false_br, true_br); } template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm512_mask_blend_pd(cond, false_br, true_br); } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch 
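// AVX512F only provides masked blends at 32/64-bit granularity
// (_mm512_mask_blend_epi32/epi64). For 8- and 16-bit elements the k-mask is
// first widened into vector lanes and the blend is carried out as two AVX2
// selects on the 256-bit halves; AVX512BW adds the native
// _mm512_mask_blend_epi8/epi16 forms.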
const& false_br, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { alignas(avx2::alignment()) uint8_t buffer[64]; // FIXME: ultra inefficient for (int i = 0; i < 64; ++i) buffer[i] = cond.data & (1ull << i) ? 0xFF : 0; __m256i cond_low = batch::load_aligned(&buffer[0]); __m256i cond_hi = batch::load_aligned(&buffer[32]); __m256i true_low, true_hi; detail::split_avx512(true_br, true_low, true_hi); __m256i false_low, false_hi; detail::split_avx512(false_br, false_low, false_hi); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); return detail::merge_avx(res_low, res_hi); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0)); __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0)); __m256i true_low, true_hi; detail::split_avx512(true_br, true_low, true_hi); __m256i false_low, false_hi; detail::split_avx512(false_br, false_low, false_hi); __m256i res_low = select(batch_bool(cond_low), batch(true_low), batch(false_low), avx2 {}); __m256i res_hi = select(batch_bool(cond_hi), batch(true_hi), batch(false_hi), avx2 {}); return detail::merge_avx(res_low, res_hi); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_mask_blend_epi32(cond, false_br, true_br); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_mask_blend_epi64(cond, false_br, true_br); } else { assert(false && "unsupported arch/type combination"); return {}; } } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... 
}, true_br, false_br, avx512f {}); } namespace detail { template using enable_signed_integer_t = typename std::enable_if::value && std::is_signed::value, int>::type; template using enable_unsigned_integer_t = typename std::enable_if::value && std::is_unsigned::value, int>::type; } // set template XSIMD_INLINE batch set(batch const&, requires_arch, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept { return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template XSIMD_INLINE batch set(batch const&, requires_arch, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept { return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0); } template ::value, void>::type> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v32hi) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 }; #else return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); #endif } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v32hu) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31 }; #else return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31); #endif } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v64qi) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, 
v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 }; #else return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); #endif } template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15, T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31, T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39, T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47, T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55, T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept { #if defined(__clang__) || __GNUC__ return __extension__(__m512i)(__v64qu) { v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63 }; #else return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63); #endif } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); using register_type = typename batch_bool::register_type; register_type r = 0; unsigned shift = 0; (void)std::initializer_list { (r |= register_type(values ? 1 : 0) << (shift++))... 
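// The throw-away initializer_list is the usual C++11 substitute for a fold
// expression: it expands the parameter pack with guaranteed left-to-right
// evaluation, setting one bit of the k-mask per boolean, least significant
// bit first.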
}; return r; } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6); // shuffle within lane if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I0 < 4 && I1 < 4 && I2 >= 16 && I2 < 20 && I3 >= 16 && I3 < 20) return _mm512_shuffle_ps(x, y, smask); // shuffle within opposite lane if ((I4 == I0 + 4) && (I5 == I1 + 4) && (I6 == I2 + 4) && (I7 == I3 + 4) && (I8 == I0 + 8) && (I9 == I1 + 8) && (I10 == I2 + 8) && (I11 == I3 + 8) && (I12 == I0 + 12) && (I13 == I1 + 12) && (I14 == I2 + 12) && (I15 == I3 + 12) && I2 < 4 && I3 < 4 && I0 >= 16 && I0 < 20 && I1 >= 16 && I1 < 20) return _mm512_shuffle_ps(y, x, smask); return shuffle(x, y, mask, generic {}); } template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7); // shuffle within lane if (I0 < 2 && I1 >= 8 && I1 < 10 && I2 >= 2 && I2 < 4 && I3 >= 10 && I3 < 12 && I4 >= 4 && I4 < 6 && I5 >= 12 && I5 < 14 && I6 >= 6 && I6 < 8 && I7 >= 14) return _mm512_shuffle_pd(x, y, smask); // shuffle within opposite lane if (I1 < 2 && I0 >= 8 && I0 < 10 && I3 >= 2 && I3 < 4 && I2 >= 10 && I2 < 12 && I5 >= 4 && I5 < 6 && I4 >= 12 && I4 < 14 && I7 >= 6 && I7 < 8 && I6 >= 14) return _mm512_shuffle_pd(y, x, smask); return shuffle(x, y, mask, generic {}); } // slide_left template XSIMD_INLINE batch slide_left(batch const&, requires_arch) noexcept { static_assert(N == 0xDEAD, "not implemented yet"); return {}; } // slide_right template XSIMD_INLINE batch slide_right(batch const&, requires_arch) noexcept { static_assert(N == 0xDEAD, "not implemented yet"); return {}; } // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm512_sqrt_ps(val); } template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm512_sqrt_pd(val); } // ssub template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { return sadd(self, -other); } else { const auto diff = min(self, other); return self - diff; } } // store template XSIMD_INLINE void store(batch_bool const& self, bool* mem, requires_arch) noexcept { using register_type = typename batch_bool::register_type; constexpr auto size = batch_bool::size; for (std::size_t i = 0; i < size; ++i) mem[i] = self.data & (register_type(1) << i); } // store_aligned template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { return _mm512_store_si512((__m512i*)mem, self); } template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm512_store_si512((__m512i*)mem, self); } template XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept { return _mm512_store_ps(mem, self); } template XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept { return _mm512_store_pd(mem, self); } // store_unaligned template ::value, void>::type> XSIMD_INLINE void 
store_unaligned(T* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_si512((__m512i*)mem, self); } template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept { return _mm512_storeu_si512((__m512i*)mem, self); } template XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_ps(mem, self); } template XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept { return _mm512_storeu_pd(mem, self); } // sub template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept { return sub(batch(s), batch(o)); }, self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { return _mm512_sub_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { return _mm512_sub_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_sub_ps(self, other); } template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { return _mm512_sub_pd(self, other); } // swizzle (dynamic version) template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_ps(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_pd(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi64(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return _mm512_permutexvar_epi32(mask, self); } template XSIMD_INLINE batch swizzle(batch const& self, batch mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } // swizzle (constant version) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return swizzle(self, mask.as_batch(), avx512f {}); } namespace detail { template struct is_pair_of_contiguous_indices; template struct is_pair_of_contiguous_indices : std::true_type { }; template struct 
is_pair_of_contiguous_indices : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices, std::false_type>::type { }; template struct fold_batch_constant { using type = batch_constant; }; } template ::value, void>::type> XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr typename detail::fold_batch_constant::type mask32; return _mm512_permutexvar_epi32(static_cast>(mask32), self); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { // FIXME: this sequence is very inefficient, but it's here to catch // a pattern generated by detail::reduce from xsimd_generic_math.hpp. // The whole pattern is actually decently folded by GCC and Clang, // so bare with it. constexpr batch_constant mask32; auto tmp = _mm512_permutexvar_epi32(static_cast>(mask32), self); alignas(A::alignment()) uint16_t buffer[32]; _mm512_store_si512((__m512i*)&buffer[0], tmp); buffer[0] = buffer[1]; return _mm512_load_si512(&buffer[0]); } template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { return bitwise_cast(swizzle(bitwise_cast(self), mask, avx512f {})); } // transpose template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[16]; for (int i = 0; i < 16; ++i) tmp_lo0[i] = _mm512_castsi512_si256(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 16, avx2 {}); batch tmp_hi0[16]; for (int i = 0; i < 16; ++i) tmp_hi0[i] = _mm512_castsi512_si256(matrix_begin[16 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 16, avx2 {}); batch tmp_lo1[16]; for (int i = 0; i < 16; ++i) tmp_lo1[i] = _mm512_extracti64x4_epi64(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 16, avx2 {}); batch tmp_hi1[16]; for (int i = 0; i < 16; ++i) tmp_hi1[i] = _mm512_extracti64x4_epi64(matrix_begin[16 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 16, avx2 {}); for (int i = 0; i < 16; ++i) matrix_begin[i] = detail::merge_avx(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 16; ++i) matrix_begin[i + 16] = detail::merge_avx(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; batch tmp_lo0[32]; for (int i = 0; i < 32; ++i) tmp_lo0[i] = _mm512_castsi512_si256(matrix_begin[i]); transpose(tmp_lo0 + 0, tmp_lo0 + 32, avx2 {}); batch tmp_hi0[32]; for (int i = 0; i < 32; ++i) tmp_hi0[i] = _mm512_castsi512_si256(matrix_begin[32 + i]); transpose(tmp_hi0 + 0, tmp_hi0 + 32, avx2 {}); batch tmp_lo1[32]; for (int i = 0; i < 32; ++i) tmp_lo1[i] = _mm512_extracti64x4_epi64(matrix_begin[i], 1); transpose(tmp_lo1 + 0, tmp_lo1 + 32, avx2 {}); batch tmp_hi1[32]; for (int i = 0; i < 32; ++i) tmp_hi1[i] = _mm512_extracti64x4_epi64(matrix_begin[32 + i], 1); transpose(tmp_hi1 + 0, tmp_hi1 + 32, avx2 {}); for (int i = 0; i < 32; ++i) matrix_begin[i] = detail::merge_avx(tmp_lo0[i], tmp_hi0[i]); for (int i = 0; i < 32; ++i) matrix_begin[i + 32] = detail::merge_avx(tmp_lo1[i], tmp_hi1[i]); } template XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { return 
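// The 512-bit transposes above proceed block-wise: each row is split into
// its low and high 256-bit halves, the four quadrants are transposed with
// the AVX2 kernels, and the rows are reassembled with the off-diagonal
// blocks swapped ([[A, B], [C, D]]^T == [[A^T, C^T], [B^T, D^T]]). The
// unsigned overloads merely reinterpret the data as the signed element type.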
transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // trunc template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); } template XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept { return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION); } // zip_hi template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { lo = _mm512_unpacklo_epi32(self, other); hi = _mm512_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { lo = _mm512_unpacklo_epi64(self, other); hi = _mm512_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0), _mm512_extracti32x4_epi32(lo, 3), 2), _mm512_extracti32x4_epi32(hi, 2), 1); } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_unpacklo_ps(self, other); auto hi = _mm512_unpackhi_ps(self, other); return _mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), _mm512_extractf32x4_ps(lo, 3), 2), _mm512_extractf32x4_ps(hi, 2), 1); } template XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); return _mm512_castps_pd(_mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0), _mm512_extractf32x4_ps(lo, 3), 2), _mm512_extractf32x4_ps(hi, 2), 1)); } // zip_lo template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { __m512i lo, hi; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { lo = _mm512_unpacklo_epi32(self, other); hi = _mm512_unpackhi_epi32(self, other); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { lo = _mm512_unpacklo_epi64(self, other); hi = _mm512_unpackhi_epi64(self, other); } else { assert(false && "unsupported arch/op combination"); return {}; } return _mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1), _mm512_extracti32x4_epi32(hi, 1), 3), _mm512_extracti32x4_epi32(lo, 1), 2); } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_unpacklo_ps(self, other); auto hi = _mm512_unpackhi_ps(self, other); return _mm512_insertf32x4( _mm512_insertf32x4( _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), _mm512_extractf32x4_ps(hi, 1), 3), _mm512_extractf32x4_ps(lo, 1), 2); } template XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other)); auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other)); return _mm512_castps_pd(_mm512_insertf32x4( _mm512_insertf32x4( 
_mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1), _mm512_extractf32x4_ps(hi, 1), 3), _mm512_extractf32x4_ps(lo, 1), 2)); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512ifma.hpp000066400000000000000000000016531475736624100222310ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VBMI_HPP #define XSIMD_AVX512VBMI_HPP #include #include #include "../types/xsimd_avx512vbmi_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512pf.hpp000066400000000000000000000016451475736624100217230ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512PF_HPP #define XSIMD_AVX512PF_HPP #include #include #include "../types/xsimd_avx512pf_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512vbmi.hpp000066400000000000000000000016531475736624100222520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VBMI_HPP #define XSIMD_AVX512VBMI_HPP #include #include #include "../types/xsimd_avx512vbmi_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp000066400000000000000000000017101475736624100237200ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP #define XSIMD_AVX512VNNI_AVX512_BW_HPP #include #include #include "../types/xsimd_avx512vnni_avx512bw_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp000066400000000000000000000017141475736624100242510ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_AVX512VNNI_AVX512VBMI_HPP #define XSIMD_AVX512VNNI_AVX512VBMI_HPP #include #include #include "../types/xsimd_avx512vnni_avx512vbmi_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_avxvnni.hpp000066400000000000000000000016421475736624100220350ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_AVXVNNI_HPP #define XSIMD_AVXVNNI_HPP #include #include #include "../types/xsimd_avxvnni_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_constants.hpp000066400000000000000000000334141475736624100223620ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_NUMERICAL_CONSTANT_HPP #define XSIMD_NUMERICAL_CONSTANT_HPP #include #include "../types/xsimd_utils.hpp" namespace xsimd { namespace constants { #define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \ template \ XSIMD_INLINE T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ XSIMD_INLINE float NAME() noexcept \ { \ return SINGLE; \ } \ template <> \ XSIMD_INLINE double NAME() noexcept \ { \ return DOUBLE; \ } #define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \ template \ XSIMD_INLINE T NAME() noexcept \ { \ return T(NAME()); \ } \ template <> \ XSIMD_INLINE float NAME() noexcept \ { \ return bit_cast((uint32_t)SINGLE); \ } \ template <> \ XSIMD_INLINE double NAME() noexcept \ { \ return bit_cast((uint64_t)DOUBLE); \ } // Under fast-math, GCC might replace signmask (minus zero) by zero #if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) #pragma GCC push_options #pragma GCC optimize("signed-zeros") #endif XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200) XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949) XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883) XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553) XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000) XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76) XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000) XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312) XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12) XSIMD_DEFINE_CONSTANT_HEX(logpi, 
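// Constants declared through XSIMD_DEFINE_CONSTANT_HEX carry their exact
// bit pattern: the hex payload is bit_cast to float/double, so the stored
// value cannot drift through decimal-literal rounding.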
0x3f928682, 0x3ff250d048e7a1bd) XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5) XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0) XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400) XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.) XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167) XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18) XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641) XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07) XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000) XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331) XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000) XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073) XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000) XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1) XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18) XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000) XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits::min(), std::numeric_limits::min()) XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704) XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000) XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31) XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6) XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e) XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) #if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX template constexpr T allbits() noexcept; template constexpr as_integer_t mask1frexp() noexcept; template constexpr as_integer_t mask2frexp() noexcept; template constexpr as_integer_t maxexponent() noexcept; template constexpr as_integer_t maxexponentm1() noexcept; template constexpr int32_t nmb() noexcept; template constexpr T zero() noexcept; template constexpr T minvalue() noexcept; template constexpr T maxvalue() noexcept; /************************** * allbits implementation * **************************/ namespace detail { template ::value> struct allbits_impl { static constexpr T get_value() noexcept { return T(~0); } }; template struct allbits_impl { static constexpr T get_value() noexcept { return nan(); } }; } template XSIMD_INLINE constexpr T allbits() noexcept { return T(detail::allbits_impl::get_value()); } /***************************** * mask1frexp implementation * *****************************/ template XSIMD_INLINE constexpr as_integer_t mask1frexp() noexcept { return as_integer_t(mask1frexp()); } template <> XSIMD_INLINE constexpr int32_t mask1frexp() 
noexcept { return 0x7f800000; } template <> XSIMD_INLINE constexpr int64_t mask1frexp() noexcept { return 0x7ff0000000000000; } /***************************** * mask2frexp implementation * *****************************/ template XSIMD_INLINE constexpr as_integer_t mask2frexp() noexcept { return as_integer_t(mask2frexp()); } template <> XSIMD_INLINE constexpr int32_t mask2frexp() noexcept { return 0x3f000000; } template <> XSIMD_INLINE constexpr int64_t mask2frexp() noexcept { return 0x3fe0000000000000; } /****************************** * maxexponent implementation * ******************************/ template XSIMD_INLINE constexpr as_integer_t maxexponent() noexcept { return as_integer_t(maxexponent()); } template <> XSIMD_INLINE constexpr int32_t maxexponent() noexcept { return 127; } template <> XSIMD_INLINE constexpr int64_t maxexponent() noexcept { return 1023; } /****************************** * maxexponent implementation * ******************************/ template XSIMD_INLINE constexpr as_integer_t maxexponentm1() noexcept { return as_integer_t(maxexponentm1()); } template <> XSIMD_INLINE constexpr int32_t maxexponentm1() noexcept { return 126; } template <> XSIMD_INLINE constexpr int64_t maxexponentm1() noexcept { return 1022; } /********************** * nmb implementation * **********************/ template XSIMD_INLINE constexpr int32_t nmb() noexcept { return nmb(); } template <> XSIMD_INLINE constexpr int32_t nmb() noexcept { return 23; } template <> XSIMD_INLINE constexpr int32_t nmb() noexcept { return 52; } /*********************** * zero implementation * ***********************/ template XSIMD_INLINE constexpr T zero() noexcept { return T(typename T::value_type(0)); } /*************************** * minvalue implementation * ***************************/ namespace detail { template struct minvalue_impl { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template struct minvalue_common { static constexpr T get_value() noexcept { return std::numeric_limits::min(); } }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl : minvalue_common { }; template <> struct minvalue_impl { XSIMD_INLINE static float get_value() noexcept { return bit_cast((uint32_t)0xff7fffff); } }; template <> struct minvalue_impl { XSIMD_INLINE static double get_value() noexcept { return bit_cast((uint64_t)0xffefffffffffffff); } }; } template constexpr T minvalue() noexcept { return T(detail::minvalue_impl::get_value()); } /*************************** * maxvalue implementation * ***************************/ template constexpr T maxvalue() noexcept { return T(std::numeric_limits::max()); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_emulated.hpp000066400000000000000000001065561475736624100221560ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_EMULATED_HPP #define XSIMD_EMULATED_HPP #include #include #include #include #include "../arch/xsimd_scalar.hpp" #include "../types/xsimd_emulated_register.hpp" #include "../types/xsimd_utils.hpp" namespace xsimd { template struct batch_bool_constant; template XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; template struct batch_constant; namespace kernel { using namespace types; // fwd template XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; namespace detail { template auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...)) { return func(bs.data[I]...); } template auto emulated_apply(F func, ::xsimd::detail::index_sequence, B const& b, Bs const&... bs) -> std::array { return { emulated_apply(func, b, bs...)... }; } template auto emulated_apply(F func, B const& b, Bs const&... bs) -> std::array { return emulated_apply(func, ::xsimd::detail::make_index_sequence(), b, bs...); } } // abs template ::size> XSIMD_INLINE batch abs(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::abs(v); }, self); } // add template ::size> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::add(v0, v1); }, self, other); } // all template ::size> XSIMD_INLINE bool all(batch_bool const& self, requires_arch>) noexcept { return std::all_of(self.data.begin(), self.data.end(), [](T v) { return bool(v); }); } // any template ::size> XSIMD_INLINE bool any(batch_bool const& self, requires_arch>) noexcept { return std::any_of(self.data.begin(), self.data.end(), [](T v) { return bool(v); }); } // batch_bool_cast template ::size> XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch>) noexcept { return { self.data }; } // bitwise_and template ::size> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_and(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_and(v0, v1); }, self, other); } // bitwise_andnot template ::size> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_andnot(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_andnot(v0, v1); }, self, other); } // bitwise_lshift template ::size> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch>) noexcept { return detail::emulated_apply([other](T v) { return xsimd::bitwise_lshift(v, other); }, self); } // bitwise_not template ::size> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::bitwise_not(v); }, self); } template ::size> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch>) noexcept { return 
detail::emulated_apply([](bool v) { return xsimd::bitwise_not(v); }, self); } // bitwise_or template ::size> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_or(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_or(v0, v1); }, self, other); } // bitwise_rshift template ::size> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch>) noexcept { return detail::emulated_apply([other](T v) { return xsimd::bitwise_rshift(v, other); }, self); } // bitwise_xor template ::size> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::bitwise_xor(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::bitwise_xor(v0, v1); }, self, other); } // bitwise_cast template ::size> XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; char* raw_data = reinterpret_cast(result.data()); const char* raw_input = reinterpret_cast(self.data.data()); memcpy(raw_data, raw_input, size * sizeof(T_out)); return result; } // broadcast template ::size> batch XSIMD_INLINE broadcast(T val, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array r; std::fill(r.begin(), r.end(), val); return r; } #if 0 // count template ::size> XSIMD_INLINE size_t count(batch_bool const& x, requires_arch>) noexcept { uint64_t m = x.mask(); // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel m = m - ((m >> 1) & (uint64_t) ~(uint64_t)0 / 3); // temp m = (m & (uint64_t) ~(uint64_t)0 / 15 * 3) + ((m >> 2) & (uint64_t) ~(uint64_t)0 / 15 * 3); // temp m = (m + (m >> 4)) & (uint64_t) ~(uint64_t)0 / 255 * 15; // temp return (m * ((uint64_t) ~(uint64_t)0 / 255)) >> (sizeof(uint64_t) - 1) * CHAR_BIT; // count } #endif // store_complex namespace detail { // complex_low template ::size> XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; for (size_t i = 0; i < size / 2; ++i) { result[2 * i] = self.real().data[i]; result[1 + 2 * i] = self.imag().data[i]; } return result; } // complex_high template ::size> XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; for (size_t i = 0; i < size / 2; ++i) { result[2 * i] = self.real().data[i + size / 2]; result[1 + 2 * i] = self.imag().data[i + size / 2]; } return result; } } // decr_if template ::size> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch>) noexcept { return self - batch(mask.data); } // div template ::size> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::div(v0, v1); }, self, other); } // fast_cast namespace detail { template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](int32_t v) { return float(v); 
}, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](uint32_t v) { return float(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](int64_t v) { return double(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](uint64_t v) { return double(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](float v) { return int32_t(v); }, self); } template ::size> XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch>) noexcept { return detail::emulated_apply([](double v) { return int64_t(v); }, self); } } // eq template ::size> XSIMD_INLINE batch_bool> eq(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::eq(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool> eq(batch_bool> const& self, batch_bool> const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::eq(v0, v1); }, self, other); } // from_bool template ::size> XSIMD_INLINE batch from_bool(batch_bool const& self, requires_arch>) noexcept { return detail::emulated_apply([](bool v) { return T(v); }, self); } // from_mask template ::size> XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array vmask; for (size_t i = 0; i < size; ++i) vmask[i] = (mask >> i) & 1u; return vmask; } // ge template ::size> XSIMD_INLINE batch_bool> ge(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::ge(v0, v1); }, self, other); } // gt template ::size> XSIMD_INLINE batch_bool> gt(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::gt(v0, v1); }, self, other); } // haddp template ::size> XSIMD_INLINE batch haddp(batch const* row, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array r; for (size_t i = 0; i < size; ++i) r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front()); return r; } // incr_if template ::size> XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch>) noexcept { return self + batch(mask.data); } // insert template ::size> XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch>) noexcept { batch other = self; other.data[I] = val; return other; } // isnan template ::size, class = typename std::enable_if::value, void>::type> XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::isnan(v); }, self); } // load_aligned template ::size> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array res; std::copy(mem, mem + size, res.begin()); return res; } // load_unaligned template ::size> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array res; std::copy(mem, mem + size, res.begin()); return res; } // load_complex namespace detail { template 
::size> XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array real, imag; for (size_t i = 0; i < size / 2; ++i) { real[i] = hi.data[2 * i]; imag[i] = hi.data[1 + 2 * i]; } for (size_t i = 0; i < size / 2; ++i) { real[size / 2 + i] = lo.data[2 * i]; imag[size / 2 + i] = lo.data[1 + 2 * i]; } return { real, imag }; } } // le template ::size> XSIMD_INLINE batch_bool> le(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::le(v0, v1); }, self, other); } // lt template ::size> XSIMD_INLINE batch_bool> lt(batch> const& self, batch> const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::lt(v0, v1); }, self, other); } // mask template ::size> XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; uint64_t res = 0; for (size_t i = 0; i < size; ++i) res |= (self.data[i] ? 1u : 0u) << i; return res; } // max template ::size> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::max(v0, v1); }, self, other); } // min template ::size> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::min(v0, v1); }, self, other); } // mul template ::size> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::mul(v0, v1); }, self, other); } // nearbyint_as_int template ::size> XSIMD_INLINE batch, A> nearbyint_as_int(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::nearbyint_as_int(v); }, self); } // neg template ::size> XSIMD_INLINE batch neg(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::neg(v); }, self); } // neq template ::size> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::neq(v0, v1); }, self, other); } template ::size> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch>) noexcept { return detail::emulated_apply([](bool v0, bool v1) { return xsimd::neq(v0, v1); }, self, other); } // reduce_add template ::size> XSIMD_INLINE T reduce_add(batch const& self, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array buffer; self.store_unaligned(buffer.data()); return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin()); } // reduce_max template ::size> XSIMD_INLINE T reduce_max(batch const& self, requires_arch>) noexcept { return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) { return xsimd::max(x, y); }); } // reduce_min template ::size> XSIMD_INLINE T reduce_min(batch const& self, requires_arch>) noexcept { return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) { return xsimd::min(x, y); }); } // rsqrt template ::size> XSIMD_INLINE batch rsqrt(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::rsqrt(v); }, self); } // select template ::size> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& 
false_br, requires_arch>) noexcept { return detail::emulated_apply([](bool c, T t, T f) { return xsimd::select(c, t, f); }, cond, true_br, false_br); } template XSIMD_INLINE batch select(batch_bool_constant const& cond, batch const& true_br, batch const& false_br, requires_arch::size>>) noexcept { constexpr size_t size = batch::size; static_assert(sizeof...(Values) == size, "consistent init"); return select((batch_bool)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {}); } // shuffle template XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch::size>>) noexcept { constexpr size_t size = batch::size; batch bmask = mask; std::array res; for (size_t i = 0; i < size; ++i) res[i] = bmask.data[i] < size ? x.data[bmask.data[i]] : y.data[bmask.data[i] - size]; return res; } // sqrt template ::size> XSIMD_INLINE batch sqrt(batch const& self, requires_arch>) noexcept { return detail::emulated_apply([](T v) { return xsimd::sqrt(v); }, self); } // slide_left template ::size> XSIMD_INLINE batch slide_left(batch const& x, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; char* raw_data = reinterpret_cast(result.data()); memset(raw_data, 0, M); memcpy(raw_data + M, reinterpret_cast(x.data.data()), sizeof(T) * result.size() - M); return result; } // slide_right template ::size> XSIMD_INLINE batch slide_right(batch const& x, requires_arch>) noexcept { constexpr size_t size = batch::size; std::array result; char* raw_data = reinterpret_cast(result.data()); memcpy(raw_data, reinterpret_cast(x.data.data()) + M, sizeof(T) * result.size() - M); memset(raw_data + sizeof(T) * result.size() - M, 0, M); return result; } // sadd template ::size> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::sadd(v0, v1); }, self, other); } // set template XSIMD_INLINE batch> set(batch> const&, requires_arch>, Values... values) noexcept { static_assert(sizeof...(Values) == batch>::size, "consistent init"); return { typename batch>::register_type { static_cast(values)... } }; } template XSIMD_INLINE batch_bool> set(batch_bool> const&, requires_arch>, Values... values) noexcept { static_assert(sizeof...(Values) == batch>::size, "consistent init"); return { std::array { static_cast(values)... 
} }; } // ssub template ::size> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::ssub(v0, v1); }, self, other); } // store_aligned template XSIMD_INLINE void store_aligned(T* mem, batch> const& self, requires_arch>) noexcept { std::copy(self.data.begin(), self.data.end(), mem); } // store_unaligned template XSIMD_INLINE void store_unaligned(T* mem, batch> const& self, requires_arch>) noexcept { std::copy(self.data.begin(), self.data.end(), mem); } // sub template ::size> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch>) noexcept { return detail::emulated_apply([](T v0, T v1) { return xsimd::sub(v0, v1); }, self, other); } // swizzle template XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch::size>>) noexcept { constexpr size_t size = batch::size; batch bmask = mask; std::array res; for (size_t i = 0; i < size; ++i) res[i] = self.data[bmask.data[i]]; return res; } // zip_hi template ::size> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch>) noexcept { constexpr size_t size = batch::size; // Note: irregular behavior for odd numbers. std::array res; if (size % 2) { for (size_t i = 0; i < size; ++i) res[i] = (i % 2 ? self : other).data[size / 2 + i / 2]; } else { for (size_t i = 0; i < size; ++i) res[i] = (i % 2 ? other : self).data[size / 2 + i / 2]; } return res; } // zip_lo template ::size> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch>) noexcept { constexpr size_t size = batch::size; // Note: irregular behavior for odd numbers. std::array res; for (size_t i = 0; i < size; ++i) res[i] = (i % 2 ? other : self).data[i / 2]; return res; } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_fma3_avx.hpp000066400000000000000000000057421475736624100220550ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_FMA3_AVX_HPP #define XSIMD_FMA3_AVX_HPP #include "../types/xsimd_fma3_avx_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmadd_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmadd_pd(x, y, z); } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmsub_ps(x, y, z); } template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fnmsub_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmadd_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmadd_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmsub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm256_fmsub_pd(x, y, z); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_fma3_avx2.hpp000066400000000000000000000030701475736624100221270ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_FMA3_AVX2_HPP #define XSIMD_FMA3_AVX2_HPP #include "../types/xsimd_fma3_avx2_register.hpp" // Allow inclusion of xsimd_fma3_avx.hpp #ifdef XSIMD_FMA3_AVX_HPP #undef XSIMD_FMA3_AVX_HPP #define XSIMD_FORCE_FMA3_AVX_HPP #endif // Disallow inclusion of ./xsimd_fma3_avx_register.hpp #ifndef XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FMA3_AVX_REGISTER_HPP #define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #endif // Include ./xsimd_fma3_avx.hpp but s/avx/avx2 #define avx avx2 #include "./xsimd_fma3_avx.hpp" #undef avx #undef XSIMD_FMA3_AVX_HPP // Carefully restore guards #ifdef XSIMD_FORCE_FMA3_AVX_HPP #define XSIMD_FMA3_AVX_HPP #undef XSIMD_FORCE_FMA3_AVX_HPP #endif #ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #undef XSIMD_FMA3_AVX_REGISTER_HPP #undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP #endif #endif xsimd-13.2.0/include/xsimd/arch/xsimd_fma3_sse.hpp000066400000000000000000000057411475736624100220500ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_FMA3_SSE_HPP #define XSIMD_FMA3_SSE_HPP #include "../types/xsimd_fma3_sse_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmadd_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmadd_pd(x, y, z); } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmsub_ps(x, y, z); } template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fnmsub_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmadd_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmadd_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmsub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch>) noexcept { return _mm_fmsub_pd(x, y, z); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_fma4.hpp000066400000000000000000000056151475736624100211770ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_FMA4_HPP #define XSIMD_FMA4_HPP #include "../types/xsimd_fma4_register.hpp" namespace xsimd { namespace kernel { using namespace types; // fnma template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmacc_ps(x, y, z); } template XSIMD_INLINE batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmacc_pd(x, y, z); } // fnms template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmsub_ps(x, y, z); } template XSIMD_INLINE batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_nmsub_pd(x, y, z); } // fma template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_macc_ps(x, y, z); } template XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_macc_pd(x, y, z); } // fms template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_msub_ps(x, y, z); } template XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept { return _mm_msub_pd(x, y, z); } } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_generic.hpp000066400000000000000000000022211475736624100217520ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_GENERIC_HPP #define XSIMD_GENERIC_HPP #include "./generic/xsimd_generic_arithmetic.hpp" #include "./generic/xsimd_generic_complex.hpp" #include "./generic/xsimd_generic_logical.hpp" #include "./generic/xsimd_generic_math.hpp" #include "./generic/xsimd_generic_memory.hpp" #include "./generic/xsimd_generic_rounding.hpp" #include "./generic/xsimd_generic_trigo.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_generic_fwd.hpp000066400000000000000000000052731475736624100226240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_GENERIC_FWD_HPP #define XSIMD_GENERIC_FWD_HPP #include "../types/xsimd_batch_constant.hpp" #include namespace xsimd { namespace kernel { // forward declaration template ::value, void>::type> XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept; template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, batch const& other, requires_arch) noexcept; template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept; template ::value, void>::type> XSIMD_INLINE T hadd(batch const& self, requires_arch) noexcept; } } #endif xsimd-13.2.0/include/xsimd/arch/xsimd_i8mm_neon64.hpp000066400000000000000000000016051475736624100224060ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. * ****************************************************************************/ #ifndef XSIMD_I8MM_NEON64_HPP #define XSIMD_I8MM_NEON64_HPP #include "../types/xsimd_i8mm_neon64_register.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_isa.hpp000066400000000000000000000052121475736624100211150ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_ISA_HPP #define XSIMD_ISA_HPP #include "../config/xsimd_arch.hpp" #include "./xsimd_generic_fwd.hpp" #if XSIMD_WITH_EMULATED #include "./xsimd_emulated.hpp" #endif #if XSIMD_WITH_SSE2 #include "./xsimd_sse2.hpp" #endif #if XSIMD_WITH_SSE3 #include "./xsimd_sse3.hpp" #endif #if XSIMD_WITH_SSSE3 #include "./xsimd_ssse3.hpp" #endif #if XSIMD_WITH_SSE4_1 #include "./xsimd_sse4_1.hpp" #endif #if XSIMD_WITH_SSE4_2 #include "./xsimd_sse4_2.hpp" #endif #if XSIMD_WITH_FMA3_SSE #include "./xsimd_fma3_sse.hpp" #endif #if XSIMD_WITH_FMA4 #include "./xsimd_fma4.hpp" #endif #if XSIMD_WITH_AVX #include "./xsimd_avx.hpp" #endif #if XSIMD_WITH_FMA3_AVX #include "./xsimd_fma3_avx.hpp" #endif #if XSIMD_WITH_AVXVNNI #include "./xsimd_avxvnni.hpp" #endif #if XSIMD_WITH_AVX2 #include "./xsimd_avx2.hpp" #endif #if XSIMD_WITH_FMA3_AVX2 #include "./xsimd_fma3_avx2.hpp" #endif #if XSIMD_WITH_AVX512F #include "./xsimd_avx512f.hpp" #endif #if XSIMD_WITH_AVX512BW #include "./xsimd_avx512bw.hpp" #endif #if XSIMD_WITH_AVX512ER #include "./xsimd_avx512er.hpp" #endif #if XSIMD_WITH_AVX512PF #include "./xsimd_avx512pf.hpp" #endif #if XSIMD_WITH_AVX512IFMA #include "./xsimd_avx512ifma.hpp" #endif #if XSIMD_WITH_AVX512VBMI #include "./xsimd_avx512vbmi.hpp" #endif #if XSIMD_WITH_AVX512VNNI_AVX512BW #include "./xsimd_avx512vnni_avx512bw.hpp" #endif #if XSIMD_WITH_AVX512VNNI_AVX512VBMI #include "./xsimd_avx512vnni_avx512vbmi.hpp" #endif #if XSIMD_WITH_NEON #include "./xsimd_neon.hpp" #endif #if XSIMD_WITH_NEON64 #include "./xsimd_neon64.hpp" #endif #if XSIMD_WITH_I8MM_NEON64 #include "./xsimd_i8mm_neon64.hpp" #endif #if XSIMD_WITH_SVE #include "./xsimd_sve.hpp" #endif #if XSIMD_WITH_RVV #include "./xsimd_rvv.hpp" #endif #if XSIMD_WITH_WASM #include "./xsimd_wasm.hpp" #endif // Must come last to have access to all conversion specializations. #include "./xsimd_generic.hpp" #endif xsimd-13.2.0/include/xsimd/arch/xsimd_neon.hpp000066400000000000000000003673651475736624100213240ustar00rootroot00000000000000/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * * Copyright (c) Serge Guelton * * * * Distributed under the terms of the BSD 3-Clause License. * * * * The full license is in the file LICENSE, distributed with this software. 
* ****************************************************************************/ #ifndef XSIMD_NEON_HPP #define XSIMD_NEON_HPP #include #include #include #include #include "../types/xsimd_neon_register.hpp" #include "../types/xsimd_utils.hpp" // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types #define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ XSIMD_INLINE RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ { \ return ::OP##_u8(a, b); \ } \ XSIMD_INLINE RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ { \ return ::OP##_u16(a, b); \ } \ XSIMD_INLINE RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ { \ return ::OP##_u32(a, b); \ } \ } #define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ XSIMD_INLINE RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ { \ return ::OP##_s8(a, b); \ } \ XSIMD_INLINE RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ { \ return ::OP##_s16(a, b); \ } \ XSIMD_INLINE RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ { \ return ::OP##_s32(a, b); \ } \ } #define WRAP_BINARY_INT(OP, RT) \ WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ XSIMD_INLINE RT OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ { \ return ::OP##_u64(a, b); \ } \ XSIMD_INLINE RT OP##_s64(int64x2_t a, int64x2_t b) noexcept \ { \ return ::OP##_s64(a, b); \ } \ } #define WRAP_BINARY_FLOAT(OP, RT) \ namespace wrap \ { \ XSIMD_INLINE RT OP##_f32(float32x4_t a, float32x4_t b) noexcept \ { \ return ::OP##_f32(a, b); \ } \ } #define WRAP_UNARY_INT_EXCLUDING_64(OP) \ namespace wrap \ { \ XSIMD_INLINE uint8x16_t OP##_u8(uint8x16_t a) noexcept \ { \ return ::OP##_u8(a); \ } \ XSIMD_INLINE int8x16_t OP##_s8(int8x16_t a) noexcept \ { \ return ::OP##_s8(a); \ } \ XSIMD_INLINE uint16x8_t OP##_u16(uint16x8_t a) noexcept \ { \ return ::OP##_u16(a); \ } \ XSIMD_INLINE int16x8_t OP##_s16(int16x8_t a) noexcept \ { \ return ::OP##_s16(a); \ } \ XSIMD_INLINE uint32x4_t OP##_u32(uint32x4_t a) noexcept \ { \ return ::OP##_u32(a); \ } \ XSIMD_INLINE int32x4_t OP##_s32(int32x4_t a) noexcept \ { \ return ::OP##_s32(a); \ } \ } #define WRAP_UNARY_INT(OP) \ WRAP_UNARY_INT_EXCLUDING_64(OP) \ namespace wrap \ { \ XSIMD_INLINE uint64x2_t OP##_u64(uint64x2_t a) noexcept \ { \ return ::OP##_u64(a); \ } \ XSIMD_INLINE int64x2_t OP##_s64(int64x2_t a) noexcept \ { \ return ::OP##_s64(a); \ } \ } #define WRAP_UNARY_FLOAT(OP) \ namespace wrap \ { \ XSIMD_INLINE float32x4_t OP##_f32(float32x4_t a) noexcept \ { \ return ::OP##_f32(a); \ } \ } // Dummy identity caster to ease coding XSIMD_INLINE uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } XSIMD_INLINE int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } XSIMD_INLINE uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } XSIMD_INLINE int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } XSIMD_INLINE uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } XSIMD_INLINE int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } XSIMD_INLINE uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } XSIMD_INLINE int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } XSIMD_INLINE float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } namespace xsimd { template struct batch_bool_constant; namespace kernel { using namespace types; namespace detail { 
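            // Editorial usage sketch (an assumption, not part of the upstream header):
            // the wrap:: helpers generated by the macros above give each intrinsic an
            // ordinary function type, since on some toolchains the raw NEON intrinsics
            // are builtins whose address cannot be taken. Once wrapped, an overload can
            // be stored and passed like any other function pointer, roughly:
            //
            //     using binary_u8 = uint8x16_t (*)(uint8x16_t, uint8x16_t);
            //     binary_u8 f = wrap::vaddq_u8; // assumes vaddq is among the wrapped ops
            //     uint8x16_t r = f(a, b);
            //
            // The detail templates that follow rely on this property to build per-type
            // dispatch tables over the wrapped intrinsics.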
template